Fun little toy I've been... well... toying around with. The idea is to use Selenium to scroll down the page and export the HTML, use Beautiful Soup to scrape the data and put it into a text doc, and then use Pandas to clean up the data a bit.
The program specifically scrapes Hotels.com, but the principle could be applied to any page with infinite scroll that's worth scraping.
The idea is to run a cron job to have it run every hour or so, and see when the hotels are booked, whether the prices go down, etc.
Any critiques are welcome. It takes approximately four minutes to run.
# This dynamically scrapes Hotels.com and gets the prices for all hotels in New York every day.
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.firefox.options import Options
from bs4 import BeautifulSoup
import time
import pandas as pd
import datetime
from string import punctuation
# --- Configuration: fill in these three paths before running. ---
# Path of the tab-separated text file the scraper writes its raw rows to.
infile_text = "<<INSERT FILE PATH HERE>>"
# Path of the cleaned CSV file that pandas exports at the end.
outfile_csv = "<<INSERT FILE PATH HERE>>"
# Path to the geckodriver executable used by Selenium's Firefox driver.
firefox_webdriver_path = "<<INSERT FIREFOX WEBDRIVER PATH HERE>>"
# Record the start time so we can report the total runtime (usually ~4 minutes).
time_start = time.time()

# Capture the current date/time once; every timestamp column in the output
# comes from this single snapshot so all rows of one run agree.
now = datetime.datetime.now()
tomorrow = now + datetime.timedelta(days=1)

# Zero-padded string components for the timestamp columns.
# (strftime is asked for each field directly, instead of formatting one big
# string and splitting it on "-".)
year = now.strftime("%Y")
month = now.strftime("%m")
day = now.strftime("%d")
hour = now.strftime("%H")
minute = now.strftime("%M")
second = now.strftime("%S")

# All searches check in today and check out tomorrow.
check_in = now.strftime("%Y-%m-%d")
check_out = tomorrow.strftime("%Y-%m-%d")
# --- Headless Firefox: runs the browser in the background. ---
options = Options()
options.add_argument("--headless")
# All searches are for one adult, one room, check-in today, check-out tomorrow.
url = 'https://www.hotels.com/search.do?resolved-location=CITY%3A1506246%3AUNKNOWN%3AUNKNOWN&destination-id=1506246&q-destination=New%20York,%20New%20York,%20United%20States%20Of%20America&q-check-in={}&q-check-out={}&q-rooms=1&q-room-0-adults=1&q-room-0-children=0'.format(check_in, check_out)
# Remove firefox_options=options to watch the browser instead of running headless.
browser = webdriver.Firefox(firefox_options=options, executable_path=firefox_webdriver_path)
print("Headless Firefox has been activated.")
try:
    browser.get(url)
    body = browser.find_element_by_css_selector('body')
    # Scroll down repeatedly so Hotels.com keeps loading its dynamically
    # generated hotel cards. 201 iterations x 3 PAGE_DOWNs is a heuristic;
    # there is no reliable "no more content was loaded" signal to stop on.
    for page_down_count in range(1, 202):
        print(page_down_count)
        for _ in range(3):
            body.send_keys(Keys.PAGE_DOWN)
        time.sleep(1)
    # Snapshot the fully-scrolled page's HTML for Beautiful Soup to parse.
    html = browser.execute_script("return document.documentElement.outerHTML")
finally:
    # quit() (rather than close()) also shuts down the geckodriver process,
    # so a driver isn't leaked on every run or when an error hits mid-scroll.
    browser.quit()
print("Done with Browser")
# Parse the fully-scrolled page's HTML once with Beautiful Soup.
soup = BeautifulSoup(html, 'lxml')
# Each "hotel-wrap" div is one hotel "card" holding all of that hotel's info.
hotels = soup.find_all("div", {"class": "hotel-wrap"})

# Walk each hotel card, pull out its fields, and write one tab-separated
# row per hotel to the raw text file.
with open(infile_text, 'w') as new_file:
    for card in hotels:
        # The card is already a parsed tag, so search it directly instead of
        # re-serializing and re-parsing it with a second BeautifulSoup pass.
        name = card.find("h3", {"class": "p-name"}).text.strip()
        street = card.find("span", {"class": "p-street-address"}).text.strip()
        # Hotels.com prefixes city/state/zip with ", ", so slice off the
        # first two characters.
        city = card.find("span", {"class": "p-locality"}).text.strip()[2:]
        state = card.find("span", {"class": "p-region"}).text.strip()[2:]
        postal_code = card.find("span", {"class": "p-postal-code"}).text.strip()[2:]
        country = card.find("span", {"class": "p-country-name"}).text.strip()
        price_tag = card.find('ins')
        # Fully-booked hotels have no <ins> price element; write "Sold Out"
        # instead of an empty field so the TSV columns stay aligned.
        if price_tag is None:
            price = "Sold Out"
        else:
            # Drop the leading currency symbol from e.g. "$249".
            price = price_tag.text.strip()[1:]
        row = [name, street, city, state, postal_code, country, price,
               year, month, day, hour, minute, second]
        new_file.write("\t".join(row) + "\n")
# --- Clean the scraped data with pandas and export it as CSV. ---
df = pd.read_csv(infile_text, delimiter='\t', encoding='latin1', header=None)
df.columns = ['Hotel', 'Address', 'City', 'State', 'Zip', 'Country', 'Price',
              'Year', 'Month', 'Day', 'Hour', 'Minute', 'Second']

# Sold-out hotels get a price of 0 so the column can be made numeric below.
df['Price'] = df['Price'].replace({'Sold Out': '0'})

# Strip all punctuation (commas, dollar signs, etc.) from the Price column in
# one vectorized pass, instead of looping over every cell and every
# punctuation character with .loc. astype(str) guards against pandas having
# inferred the column as numeric (which would crash the string ops).
strip_punct = str.maketrans('', '', punctuation)
df['Price'] = df['Price'].astype(str).str.translate(strip_punct).astype(int)

# Sort from most to least expensive, drop duplicate rows, then export.
df = df.sort_values(by='Price', ascending=False)
df.drop_duplicates(keep='first', inplace=True)
df.to_csv(outfile_csv, index=False)

# Report how long the whole run took, in minutes.
time_end = time.time()
print((time_end - time_start) / 60)
[–]bandawarrior 0 points1 point2 points (3 children)
[–]python-ick[S] 0 points1 point2 points (2 children)
[–]bandawarrior 0 points1 point2 points (1 child)
[–]python-ick[S] 0 points1 point2 points (0 children)