Last active
July 6, 2016 10:09
-
-
Save AlessandraSozzi/8c61b138d7386ef3456c2fa25651d612 to your computer and use it in GitHub Desktop.
Script to scrape data from iknow-uk.com
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import random
import time

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
def load_website(u, driver_path='/Users/Alessandra/Documents/WebDriver/chromedriver'):
    """Start a Chrome WebDriver session and navigate it to ``u``.

    Args:
        u: URL to open.
        driver_path: Filesystem path of the chromedriver executable. The
            default is the path the script was originally written against;
            pass a different one to run on another machine.

    Returns:
        The live ``webdriver.Chrome`` instance. The caller owns it and is
        responsible for shutting it down.
    """
    d = webdriver.Chrome(driver_path)
    try:
        d.get(u)
    except Exception:
        # Navigation failed: do not leave an orphaned browser behind.
        d.quit()
        raise
    return d
def scrape_page(d):
    """Parse the page currently loaded in driver ``d``.

    Reads ``d.page_source`` and returns it as a ``BeautifulSoup`` tree built
    with the ``lxml`` parser.
    """
    return BeautifulSoup(d.page_source, "lxml")
def load_whole_page(url, id_elem, max_clicks=400, max_delay=60):
    """Open ``url``, repeatedly click the element ``id_elem``, then parse the page.

    The element is clicked up to ``max_clicks`` times, with a random pause of
    up to ``max_delay`` seconds after each click. Clicking stops early as
    soon as the element can no longer be found or is no longer displayed.

    Args:
        url: Page to open.
        id_elem: HTML ``id`` of the element to click (the script passes the
            "more results" button).
        max_clicks: Upper bound on the number of clicks.
        max_delay: Upper bound, in seconds, of the random pause after a click.

    Returns:
        The ``BeautifulSoup`` tree produced by ``scrape_page`` for the final
        state of the page.
    """
    driver = load_website(url)
    try:
        for n in range(max_clicks):
            try:
                e = driver.find_element_by_id(id_elem)
            except NoSuchElementException:
                # The element is gone from the DOM: nothing left to click.
                print("Element not found")
                break
            e.click()
            # Random pause; presumably gives the page time to load the next
            # batch and avoids hammering the site at a fixed rate.
            time.sleep(random.random() * max_delay)
            if not e.is_displayed():
                print("Element not found")
                break
            print("Element found page %d" % n)
        return scrape_page(driver)
    finally:
        # quit() (rather than close()) also ends the chromedriver process,
        # and running it in ``finally`` releases the browser on errors too.
        driver.quit()
# Accommodation categories to scrape. 'url' is the listing page that gets
# loaded; 'title' becomes the base name of the CSV written for that category.
touring_camping = {'url': 'https://www.iknow-uk.com/uk/touring-camping/', 'title': 'touring-camping'}
caravan_parks = {'url': 'https://www.iknow-uk.com/uk/caravan-parks/', 'title': 'caravan-parks'}
holiday_cottages = {'url': 'https://www.iknow-uk.com/uk/holiday-cottages/', 'title': 'holiday-cottages'}
flats_apartments = {'url': 'https://www.iknow-uk.com/uk/flats-apartments/', 'title': 'flats-apartments'}
guest_houses = {'url': 'https://www.iknow-uk.com/uk/guest-houses/', 'title': 'guest-houses'}
hotels = {'url': 'https://www.iknow-uk.com/uk/hotels/', 'title': 'hotels'}
pub_inns = {'url': 'https://www.iknow-uk.com/uk/pub-inns/', 'title': 'pub-inns'}
lodges_cabins = {'url': 'https://www.iknow-uk.com/uk/lodges-cabins/', 'title': 'lodges-cabins'}
pages = [
    touring_camping,
    caravan_parks,
    holiday_cottages,
    flats_apartments,
    guest_houses,
    hotels,
    pub_inns,
    lodges_cabins,
]

# Column order of every exported CSV; parse_property() returns values in
# this same order.
CSV_COLUMNS = ['name', 'location', 'sleeps', 'price_info_text', 'summary_text']


def _clean(text):
    """Replace newlines with spaces and strip surrounding whitespace."""
    return text.replace('\n', ' ').strip()


def parse_property(li):
    """Extract one CSV row from a ``propertySummary`` element.

    Args:
        li: Element exposing ``find(tag, attrs=...)`` (a BeautifulSoup tag in
            the script) that contains the ``firstSection cf`` and
            ``summaryText`` divs.

    Returns:
        ``[name, location, sleeps, price, summary_text]`` as strings.
        ``sleeps`` is ``''`` when the listing has no ``sleeps`` div; the price
        text additionally has internal whitespace runs collapsed to one space.

    Raises:
        AttributeError: if any div other than ``sleeps`` is missing.
    """
    first_section = li.find('div', attrs={'class': 'firstSection cf'})
    name_container = first_section.find('div', attrs={'class': 'nameContainer'})

    name = _clean(name_container.find('div', attrs={'class': 'name'}).text)
    location = _clean(name_container.find('div', attrs={'class': 'address'}).text)

    # 'sleeps' is the only optional field: test for it explicitly instead of
    # hiding every possible error behind a bare ``except``.
    sleeps_div = name_container.find('div', attrs={'class': 'sleeps'})
    sleeps = _clean(sleeps_div.text) if sleeps_div is not None else ''

    price = _clean(first_section.find('div', attrs={'class': 'price'}).text)
    price = ' '.join(price.split())

    summary_text = _clean(li.find('div', attrs={'class': 'summaryText'}).text)
    return [name, location, sleeps, price, summary_text]


def main():
    """Scrape every category in ``pages`` and write ``<title>.csv`` for each."""
    for page in pages:
        content = load_whole_page(page['url'], 'moreResultsButton')
        list_li = content.find_all('div', attrs={'class': 'propertySummary'})
        # Collect all rows first and build the frame once; growing a
        # DataFrame one ``.loc`` row at a time re-allocates on every append.
        rows = [parse_property(li) for li in list_li]
        df = pd.DataFrame(rows, columns=CSV_COLUMNS)
        df.to_csv(page['title'] + '.csv', index=False, encoding='utf-8')


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment