Skip to content

Instantly share code, notes, and snippets.

@AlessandraSozzi
Last active July 6, 2016 10:09
Show Gist options
  • Save AlessandraSozzi/8c61b138d7386ef3456c2fa25651d612 to your computer and use it in GitHub Desktop.
Save AlessandraSozzi/8c61b138d7386ef3456c2fa25651d612 to your computer and use it in GitHub Desktop.
Script to scrape data from i-know-uk.com
import random
import time

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
def load_website(u, driver_path='/Users/Alessandra/Documents/WebDriver/chromedriver'):
    """Start a Chrome WebDriver session and navigate it to ``u``.

    Args:
        u: URL to open.
        driver_path: Path to the chromedriver executable. Defaults to the
            location this script was originally written for; pass another
            path to run it on a different machine.

    Returns:
        The WebDriver instance with ``u`` loaded. The caller is responsible
        for shutting it down.

    Raises:
        Whatever ``webdriver.Chrome`` or ``get`` raises; if navigation fails
        the browser is shut down before the error propagates.
    """
    d = webdriver.Chrome(driver_path)
    try:
        d.get(u)
    except Exception:
        # Navigation failed: do not leave an orphaned browser behind, then
        # let the caller see the original error.
        d.quit()
        raise
    return d
def scrape_page(d):
    """Parse the page currently loaded in driver ``d``.

    Reads ``d.page_source`` and returns it as a BeautifulSoup tree built
    with the ``lxml`` parser.
    """
    return BeautifulSoup(d.page_source, "lxml")
def load_whole_page(url, id_elem, max_clicks=400, max_delay=60):
    """Open ``url``, repeatedly click the element ``id_elem``, then parse the page.

    The element is clicked up to ``max_clicks`` times, pausing a random
    interval between clicks. Clicking stops as soon as the element is no
    longer displayed (or can no longer be found), after which the page
    source is parsed.

    Args:
        url: Page to open.
        id_elem: HTML ``id`` of the element to click.
        max_clicks: Upper bound on the number of clicks.
        max_delay: Upper bound, in seconds, of the random pause after each
            click.

    Returns:
        The BeautifulSoup tree produced by ``scrape_page``.
    """
    from selenium.common.exceptions import (
        NoSuchElementException,
        StaleElementReferenceException,
    )

    driver = load_website(url)
    try:
        for n in range(max_clicks):
            try:
                e = driver.find_element_by_id(id_elem)
            except NoSuchElementException:
                # No such element on the page: nothing (more) to load.
                print("Element not found")
                break
            e.click()
            time.sleep(random.random() * max_delay)
            try:
                still_visible = e.is_displayed()
            except StaleElementReferenceException:
                # The element was detached from the DOM after the click;
                # treat that the same as it being hidden.
                still_visible = False
            if still_visible:
                print("Element found page %d" % n)
            else:
                print("Element not found")
                break
        return scrape_page(driver)
    finally:
        # Always release the browser, even if a click or the parse failed.
        # quit() ends the whole WebDriver session, not just the window.
        driver.quit()
# One entry per accommodation category: the listing URL to scrape and the
# title used as the base name of the CSV file written for it.
touring_camping = {
    'url': 'https://www.iknow-uk.com/uk/touring-camping/',
    'title': 'touring-camping',
}
caravan_parks = {
    'url': 'https://www.iknow-uk.com/uk/caravan-parks/',
    'title': 'caravan-parks',
}
holiday_cottages = {
    'url': 'https://www.iknow-uk.com/uk/holiday-cottages/',
    'title': 'holiday-cottages',
}
flats_apartments = {
    'url': 'https://www.iknow-uk.com/uk/flats-apartments/',
    'title': 'flats-apartments',
}
guest_houses = {
    'url': 'https://www.iknow-uk.com/uk/guest-houses/',
    'title': 'guest-houses',
}
hotels = {
    'url': 'https://www.iknow-uk.com/uk/hotels/',
    'title': 'hotels',
}
pub_inns = {
    'url': 'https://www.iknow-uk.com/uk/pub-inns/',
    'title': 'pub-inns',
}
lodges_cabins = {
    'url': 'https://www.iknow-uk.com/uk/lodges-cabins/',
    'title': 'lodges-cabins',
}

# Categories are processed in this order.
pages = [
    touring_camping,
    caravan_parks,
    holiday_cottages,
    flats_apartments,
    guest_houses,
    hotels,
    pub_inns,
    lodges_cabins,
]
def _clean_text(tag):
    """Return ``tag``'s text with newlines replaced by spaces and ends trimmed."""
    return tag.text.replace('\n', ' ').strip()


def _parse_property(summary):
    """Turn one ``propertySummary`` div into a CSV row.

    Returns ``[name, location, sleeps, price, summary_text]``. ``sleeps`` is
    the only optional field and becomes ``''`` when its div is absent; a
    missing div for any other field raises ``AttributeError``.
    """
    first_section = summary.find('div', attrs={'class': 'firstSection cf'})
    name_container = first_section.find('div', attrs={'class': 'nameContainer'})

    name = _clean_text(name_container.find('div', attrs={'class': 'name'}))
    location = _clean_text(name_container.find('div', attrs={'class': 'address'}))

    # find() returns None when there is no match, so test for that directly
    # instead of hiding every possible error behind a bare except.
    sleeps_tag = name_container.find('div', attrs={'class': 'sleeps'})
    sleeps = _clean_text(sleeps_tag) if sleeps_tag is not None else ''

    price = _clean_text(first_section.find('div', attrs={'class': 'price'}))
    price = ' '.join(price.split())  # collapse runs of whitespace

    summary_text = _clean_text(summary.find('div', attrs={'class': 'summaryText'}))
    return [name, location, sleeps, price, summary_text]


# Scrape every category and write one CSV per category, named after its title.
for page in pages:
    content = load_whole_page(page['url'], 'moreResultsButton')
    list_li = content.find_all('div', attrs={'class': 'propertySummary'})
    # Collect all rows first and build the frame once; growing a DataFrame
    # one row at a time re-allocates on every insert.
    rows = [_parse_property(li) for li in list_li]
    df = pd.DataFrame(
        rows,
        columns=['name', 'location', 'sleeps', 'price_info_text', 'summary_text'],
    )
    df.to_csv(page['title'] + '.csv', index=False, encoding='utf-8')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment