Skip to content

Instantly share code, notes, and snippets.

@r4vi
Created September 21, 2015 09:15
Show Gist options
  • Save r4vi/2c121bdafdb6a063688a to your computer and use it in GitHub Desktop.
Save r4vi/2c121bdafdb6a063688a to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"from requests_futures.sessions import FuturesSession"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"from bs4 import BeautifulSoup"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"from bs4 import BeautifulSoup"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"resp = requests.get(BASE_URL + '1')"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"BASE_URL = 'http://events.londonopenhouse.org/Venues?q=&Page='"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"MAX_PAGES = 71"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"resp = requests.get(BASE_URL + '1')"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"import lxml, cssselect, requests"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"resp = requests.get(BASE_URL + '1')"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"soup = BeautifulSoup(resp.content)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"soup"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"soup"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"soup.select('.listing-detail')"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"len(soup.select('.listing-detail'))"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"from requests_futures.sessions import FuturesSession"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"session = FuturesSession(max_workers=10)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"urls = [x for x in range(MAX_PAGES)]"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"urls"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"urls = [x for x in range(MAX_PAGES)]"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"urls"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"urls = [x for x in range(MAX_PAGES+1)]"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
"urls"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"urls = [BASE_URL+str(x) for x in range(MAX_PAGES+1)]"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
"urls"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [],
"source": [
"urls = [BASE_URL+str(x) for x in range(MAX_PAGES+1)]"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [],
"source": [
"resps = [session.get(x) for x in urls]"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [],
"source": [
"resps"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
"resps_results = [x.result() for x in resps]"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [],
"source": [
"resps_results"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [],
"source": [
"resps_results"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [],
"source": [
"len(resps_results)"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [],
"source": [
"resps_listings = [BeautifulSoup(x.content).select('.listing-detail') for x in resps_results]"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [],
"source": [
"len(resps_listings)"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [],
"source": [
"resps_listings[0]"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [],
"source": [
"import itertools"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [],
"source": [
"listing_details = list(itertools.chain.from_iterable(resps_listings))"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [],
"source": [
"listing_details = list(itertools.chain.from_iterable(resps_listings))"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [],
"source": [
"listing_details"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"outputs": [],
"source": [
"len(listing_details)"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [],
"source": [
"d1 = listing_details[0]"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [],
"source": [
"d1"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [],
"source": [
"type(d1)"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {},
"outputs": [],
"source": [
"d1.select('.address')"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {},
"outputs": [],
"source": [
"d1.select_one('address')"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {},
"outputs": [],
"source": [
"d1.select_one('address').text"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {},
"outputs": [],
"source": [
"{'address': d1.select_one('address').text }"
]
},
{
"cell_type": "code",
"execution_count": 47,
"metadata": {},
"outputs": [],
"source": [
"{'address': d1.select_one('address').text, 'name': d1.select_one('a').text }"
]
},
{
"cell_type": "code",
"execution_count": 48,
"metadata": {},
"outputs": [],
"source": [
"{'address': d1.select_one('address').text, 'name': d1.select_one('a').text, 'detail': d1.select_one('a')['href'] }"
]
},
{
"cell_type": "code",
"execution_count": 49,
"metadata": {},
"outputs": [],
"source": [
"{'address': d1.select_one('address').text, 'name': d1.select_one('a').text, 'detail': d1.select_one('a')['href'] }"
]
},
{
"cell_type": "code",
"execution_count": 50,
"metadata": {},
"outputs": [],
"source": [
"def tag_to_dict(d1):\n",
" return {'address': d1.select_one('address').text, 'name': d1.select_one('a').text, 'detail': d1.select_one('a')['href'] }"
]
},
{
"cell_type": "code",
"execution_count": 51,
"metadata": {},
"outputs": [],
"source": [
"listing_details_dicts = [tag_to_dict(x) for x in listing_details]"
]
},
{
"cell_type": "code",
"execution_count": 52,
"metadata": {},
"outputs": [],
"source": [
"listing_details_dicts"
]
},
{
"cell_type": "code",
"execution_count": 53,
"metadata": {},
"outputs": [],
"source": [
"BASE_URL"
]
},
{
"cell_type": "code",
"execution_count": 54,
"metadata": {},
"outputs": [],
"source": [
"BASE_DETAIL_URL = 'http://events.londonopenhouse.org'"
]
},
{
"cell_type": "code",
"execution_count": 55,
"metadata": {},
"outputs": [],
"source": [
"for x in listing_details_dicts:\n",
" x['detail'] = BASE_DETAIL_URL + x['detail']\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 56,
"metadata": {},
"outputs": [],
"source": [
"listing_details_dicts"
]
},
{
"cell_type": "code",
"execution_count": 57,
"metadata": {},
"outputs": [],
"source": [
"for x in listing_details_dicts:\n",
" x['detail_content'] = requests.get(x['detail'])\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 58,
"metadata": {},
"outputs": [],
"source": [
"listing_details_dicts"
]
},
{
"cell_type": "code",
"execution_count": 59,
"metadata": {},
"outputs": [],
"source": [
"c1 = listing_details_dicts[0]['detail_content']"
]
},
{
"cell_type": "code",
"execution_count": 60,
"metadata": {},
"outputs": [],
"source": [
"c1 = listing_details_dicts[0]['detail_content']"
]
},
{
"cell_type": "code",
"execution_count": 61,
"metadata": {},
"outputs": [],
"source": [
"c1"
]
},
{
"cell_type": "code",
"execution_count": 62,
"metadata": {},
"outputs": [],
"source": [
"bs = BeautifulSoup(c1.content)"
]
},
{
"cell_type": "code",
"execution_count": 63,
"metadata": {},
"outputs": [],
"source": [
"bs.select('#body')"
]
},
{
"cell_type": "code",
"execution_count": 64,
"metadata": {},
"outputs": [],
"source": [
"bs.select('#body').select_one('div[data-body]')"
]
},
{
"cell_type": "code",
"execution_count": 65,
"metadata": {},
"outputs": [],
"source": [
"bs.select_one('#body').select_one('div[data-body]')"
]
},
{
"cell_type": "code",
"execution_count": 66,
"metadata": {},
"outputs": [],
"source": [
"bs.select_one('#body').select_one('div[data-body]').text"
]
},
{
"cell_type": "code",
"execution_count": 67,
"metadata": {},
"outputs": [],
"source": [
"bs.select_one('#body').select_one('div[data-body]').text.strip()"
]
},
{
"cell_type": "code",
"execution_count": 68,
"metadata": {},
"outputs": [],
"source": [
"{'description': bs.select_one('#body').select_one('div[data-body]').text.strip() }"
]
},
{
"cell_type": "code",
"execution_count": 69,
"metadata": {},
"outputs": [],
"source": [
"{'description': bs.select_one('#body').select_one('div[data-body]').text.strip(), }"
]
},
{
"cell_type": "code",
"execution_count": 70,
"metadata": {},
"outputs": [],
"source": [
"{'description': bs.select_one('#body').select_one('div[data-body]').text.strip(),\n",
" 'open_times': ''}"
]
},
{
"cell_type": "code",
"execution_count": 71,
"metadata": {},
"outputs": [],
"source": [
"body = bs.select_one('#body')"
]
},
{
"cell_type": "code",
"execution_count": 72,
"metadata": {},
"outputs": [],
"source": [
"{'description': body.select_one('div[data-body]').text.strip(),\n",
" 'open_times': body.select('a[href^=/venues]') }"
]
},
{
"cell_type": "code",
"execution_count": 73,
"metadata": {},
"outputs": [],
"source": [
"{'description': body.select_one('div[data-body]').text.strip(),\n",
" 'open_times': body.select('a[href^=/venues/accessTimeIds]') }"
]
},
{
"cell_type": "code",
"execution_count": 74,
"metadata": {},
"outputs": [],
"source": [
"{'description': body.select_one('div[data-body]').text.strip(),\n",
" 'open_times': body.select('a[href^=/venues?accessTimeIds]') }"
]
},
{
"cell_type": "code",
"execution_count": 75,
"metadata": {},
"outputs": [],
"source": [
"{'description': body.select_one('div[data-body]').text.strip(),\n",
" 'open_times': [x.text for x in body.select('a[href^=/venues?accessTimeIds]')] }"
]
},
{
"cell_type": "code",
"execution_count": 76,
"metadata": {},
"outputs": [],
"source": [
"{'description': body.select_one('div[data-body]').text.strip(),\n",
" 'open_times': [x.text.strip() for x in body.select('a[href^=/venues?accessTimeIds]')] }"
]
},
{
"cell_type": "code",
"execution_count": 77,
"metadata": {},
"outputs": [],
"source": [
"{'description': body.select_one('div[data-body]').text.strip(),\n",
" 'open_times': [x.text.strip() for x in body.select('table tr')] }"
]
},
{
"cell_type": "code",
"execution_count": 78,
"metadata": {},
"outputs": [],
"source": [
"{'description': body.select_one('div[data-body]').text.strip(),\n",
" 'open_times': [x.text.strip() for x in body.select('table tr')[0]] }"
]
},
{
"cell_type": "code",
"execution_count": 79,
"metadata": {},
"outputs": [],
"source": [
"{'description': body.select_one('div[data-body]').text.strip(),\n",
" 'open_times': body.select('table tr') }"
]
},
{
"cell_type": "code",
"execution_count": 80,
"metadata": {},
"outputs": [],
"source": [
"{'description': body.select_one('div[data-body]').text.strip(),\n",
" 'open_times': [x.parent().text.strip() for x in body.select('a[href^=/venues?accessTimeIds]')] }"
]
},
{
"cell_type": "code",
"execution_count": 81,
"metadata": {},
"outputs": [],
"source": [
"{'description': body.select_one('div[data-body]').text.strip(),\n",
" 'open_times': [x.parent() for x in body.select('a[href^=/venues?accessTimeIds]')] }"
]
},
{
"cell_type": "code",
"execution_count": 82,
"metadata": {},
"outputs": [],
"source": [
"{'description': body.select_one('div[data-body]').text.strip(),\n",
" 'open_times': [x.parent(1) for x in body.select('a[href^=/venues?accessTimeIds]')] }"
]
},
{
"cell_type": "code",
"execution_count": 83,
"metadata": {},
"outputs": [],
"source": [
"{'description': body.select_one('div[data-body]').text.strip(),\n",
" 'open_times': [x.parent().parent() for x in body.select('a[href^=/venues?accessTimeIds]')] }"
]
},
{
"cell_type": "code",
"execution_count": 84,
"metadata": {},
"outputs": [],
"source": [
"{'description': body.select_one('div[data-body]').text.strip(),\n",
" 'open_times': [x.parent() for x in body.select('a[href^=/venues?accessTimeIds]')] }"
]
},
{
"cell_type": "code",
"execution_count": 85,
"metadata": {},
"outputs": [],
"source": [
"{'description': body.select_one('div[data-body]').text.strip(),\n",
" 'open_times': [x.parent() for x in body.select('a[href^=/venues?accessTimeIds]').parent()] }"
]
},
{
"cell_type": "code",
"execution_count": 86,
"metadata": {},
"outputs": [],
"source": [
"{'description': body.select_one('div[data-body]').text.strip(),\n",
" 'open_times': [x.getparent() for x in body.select('a[href^=/venues?accessTimeIds]')] }"
]
},
{
"cell_type": "code",
"execution_count": 87,
"metadata": {},
"outputs": [],
"source": [
"{'description': body.select_one('div[data-body]').text.strip(),\n",
" 'open_times': [type(x) for x in body.select('a[href^=/venues?accessTimeIds]')] }"
]
},
{
"cell_type": "code",
"execution_count": 88,
"metadata": {},
"outputs": [],
"source": [
"{'description': body.select_one('div[data-body]').text.strip(),\n",
" 'open_times': [x.parent for x in body.select('a[href^=/venues?accessTimeIds]')] }"
]
},
{
"cell_type": "code",
"execution_count": 89,
"metadata": {},
"outputs": [],
"source": [
"{'description': body.select_one('div[data-body]').text.strip(),\n",
" 'open_times': [x.parent.text for x in body.select('a[href^=/venues?accessTimeIds]')] }"
]
},
{
"cell_type": "code",
"execution_count": 90,
"metadata": {},
"outputs": [],
"source": [
"{'description': body.select_one('div[data-body]').text.strip(),\n",
" 'open_times': [x.parent.text.strip() for x in body.select('a[href^=/venues?accessTimeIds]')] }"
]
},
{
"cell_type": "code",
"execution_count": 91,
"metadata": {},
"outputs": [],
"source": [
"{'description': body.select_one('div[data-body]').text.strip(),\n",
" 'open_times': [x.parent.text.strip() for x in body.select('a[href^=/venues?accessTimeIds]')] }"
]
},
{
"cell_type": "code",
"execution_count": 92,
"metadata": {},
"outputs": [],
"source": [
"{'description': body.select_one('div[data-body]').text.strip(),\n",
" 'open_times': [x.parent.text.strip().split('|') for x in body.select('a[href^=/venues?accessTimeIds]')] }"
]
},
{
"cell_type": "code",
"execution_count": 93,
"metadata": {},
"outputs": [],
"source": [
"{'description': body.select_one('div[data-body]').text.strip(),\n",
" 'open_times': [y.strip() for y in [x.parent.text.strip().split('|') for x in body.select('a[href^=/venues?accessTimeIds]')]] }"
]
},
{
"cell_type": "code",
"execution_count": 94,
"metadata": {},
"outputs": [],
"source": [
"{'description': body.select_one('div[data-body]').text.strip(),\n",
" 'open_times': [y.strip() for y in [x.parent.text.strip().split('|') for x in body.select('a[href^=/venues?accessTimeIds]')]] }"
]
},
{
"cell_type": "code",
"execution_count": 95,
"metadata": {},
"outputs": [],
"source": [
"{'description': body.select_one('div[data-body]').text.strip(),\n",
" 'open_times': [x.parent.text.strip().split('|') for x in body.select('a[href^=/venues?accessTimeIds]')] }"
]
},
{
"cell_type": "code",
"execution_count": 96,
"metadata": {},
"outputs": [],
"source": [
"{'description': body.select_one('div[data-body]').text.strip(),\n",
" 'open_times': [[y.strip() for y in x.parent.text.strip().split('|')] for x in body.select('a[href^=/venues?accessTimeIds]')] }"
]
},
{
"cell_type": "code",
"execution_count": 97,
"metadata": {},
"outputs": [],
"source": [
"{'description': body.select_one('div[data-body]').text.strip(),\n",
" 'open_times': [','.join([y.strip() for y in x.parent.text.strip().split('|')]) for x in body.select('a[href^=/venues?accessTimeIds]')] }"
]
},
{
"cell_type": "code",
"execution_count": 98,
"metadata": {},
"outputs": [],
"source": [
"{'description': body.select_one('div[data-body]').text.strip(),\n",
" 'open_times': [', '.join([y.strip() for y in x.parent.text.strip().split('|')]) for x in body.select('a[href^=/venues?accessTimeIds]')] }"
]
},
{
"cell_type": "code",
"execution_count": 99,
"metadata": {},
"outputs": [],
"source": [
"def resp_to_detail(resp):\n",
" bs = BeautifulSoup(resp.content)\n",
" body = bs.select_one('#body')\n",
" return {'description': body.select_one('div[data-body]').text.strip(),\n",
" 'open_times': [', '.join([y.strip() for y in x.parent.text.strip().split('|')]) for x in body.select('a[href^=/venues?accessTimeIds]')] }"
]
},
{
"cell_type": "code",
"execution_count": 100,
"metadata": {},
"outputs": [],
"source": [
"resp_to_detail(listing_details_dicts[0]['detail_content'])"
]
},
{
"cell_type": "code",
"execution_count": 101,
"metadata": {},
"outputs": [],
"source": [
"resp_to_detail(listing_details_dicts[0]['detail_content'])"
]
},
{
"cell_type": "code",
"execution_count": 102,
"metadata": {},
"outputs": [],
"source": [
"resp_to_detail(listing_details_dicts[0]['detail_content'])"
]
},
{
"cell_type": "code",
"execution_count": 103,
"metadata": {},
"outputs": [],
"source": [
"listing_details_dicts[0]"
]
},
{
"cell_type": "code",
"execution_count": 104,
"metadata": {},
"outputs": [],
"source": [
"resp_to_detail(listing_details_dicts[0]['detail_content'])"
]
},
{
"cell_type": "code",
"execution_count": 105,
"metadata": {},
"outputs": [],
"source": [
"resp_to_detail(listing_details_dicts[0]['detail_content'])"
]
},
{
"cell_type": "code",
"execution_count": 106,
"metadata": {},
"outputs": [],
"source": [
"def resp_to_detail(resp):\n",
" bs = BeautifulSoup(resp.content)\n",
" body = bs.select_one('#body')\n",
" return {'description': body.select_one('div[data-body]').text.strip(),\n",
" 'open_times': [', '.join([y.strip() for y in x.parent.text.strip().split('|')]) for x in body.select('a[href^=/venues?accessTimeIds]')] }"
]
},
{
"cell_type": "code",
"execution_count": 107,
"metadata": {},
"outputs": [],
"source": [
"resp_to_detail(listing_details_dicts[0]['detail_content'])"
]
},
{
"cell_type": "code",
"execution_count": 108,
"metadata": {},
"outputs": [],
"source": [
"body"
]
},
{
"cell_type": "code",
"execution_count": 109,
"metadata": {},
"outputs": [],
"source": [
"body.select_one('label[for=Venue_LastEntryTime]')"
]
},
{
"cell_type": "code",
"execution_count": 110,
"metadata": {},
"outputs": [],
"source": [
"body.select_one('label[for=Venue_LastEntryTime]').parent"
]
},
{
"cell_type": "code",
"execution_count": 111,
"metadata": {},
"outputs": [],
"source": [
"body.select_one('label[for=Venue_LastEntryTime]').parent.parent"
]
},
{
"cell_type": "code",
"execution_count": 112,
"metadata": {},
"outputs": [],
"source": [
"body.select_one('label[for=Venue_LastEntryTime]').parent.parent.text"
]
},
{
"cell_type": "code",
"execution_count": 113,
"metadata": {},
"outputs": [],
"source": [
"body.select_one('label[for=Venue_LastEntryTime]').parent.parent.text"
]
},
{
"cell_type": "code",
"execution_count": 114,
"metadata": {},
"outputs": [],
"source": [
"body.select_one('label[for=Venue_LastEntryTime]').parent.parent.text.strip()"
]
},
{
"cell_type": "code",
"execution_count": 115,
"metadata": {},
"outputs": [],
"source": [
"body.select_one('label[for=Venue_LastEntryTime]').parent.parent"
]
},
{
"cell_type": "code",
"execution_count": 116,
"metadata": {},
"outputs": [],
"source": [
"body.select_one('label[for=Venue_LastEntryTime]').parent.parent.select_one('td')"
]
},
{
"cell_type": "code",
"execution_count": 117,
"metadata": {},
"outputs": [],
"source": [
"body.select_one('label[for=Venue_LastEntryTime]').parent.parent.select_one('td:list')"
]
},
{
"cell_type": "code",
"execution_count": 118,
"metadata": {},
"outputs": [],
"source": [
"body.select_one('label[for=Venue_LastEntryTime]').parent.parent.select_one('td:last')"
]
},
{
"cell_type": "code",
"execution_count": 119,
"metadata": {},
"outputs": [],
"source": [
"body.select_one('label[for=Venue_LastEntryTime]').parent.parent.select_one('td:nth-of-type(1)')"
]
},
{
"cell_type": "code",
"execution_count": 120,
"metadata": {},
"outputs": [],
"source": [
"body.select_one('label[for=Venue_LastEntryTime]').parent.parent.select_one('td:nth-of-type(2)')"
]
},
{
"cell_type": "code",
"execution_count": 121,
"metadata": {},
"outputs": [],
"source": [
"body.select_one('label[for=Venue_LastEntryTime]').parent.parent.select_one('td:nth-of-type(2)').text"
]
},
{
"cell_type": "code",
"execution_count": 122,
"metadata": {},
"outputs": [],
"source": [
"body.select_one('label[for=Venue_LastEntryTime]').parent.parent.select_one('td:nth-of-type(2)').text.strip()"
]
},
{
"cell_type": "code",
"execution_count": 123,
"metadata": {},
"outputs": [],
"source": [
"def resp_to_detail(resp):\n",
" bs = BeautifulSoup(resp.content)\n",
" body = bs.select_one('#body')\n",
" return {'description': body.select_one('div[data-body]').text.strip(),\n",
" 'open_times': [', '.join([y.strip() for y in x.parent.text.strip().split('|')]) for x in body.select('a[href^=/venues?accessTimeIds]')], \n",
" 'last_entry': body.select_one('label[for=Venue_LastEntryTime]').parent.parent.select_one('td:nth-of-type(2)').text.strip()\n",
"} "
]
},
{
"cell_type": "code",
"execution_count": 124,
"metadata": {},
"outputs": [],
"source": [
"def resp_to_detail(resp):\n",
" bs = BeautifulSoup(resp.content)\n",
" body = bs.select_one('#body')\n",
" return {'description': body.select_one('div[data-body]').text.strip(),\n",
" 'open_times': [', '.join([y.strip() for y in x.parent.text.strip().split('|')]) for x in body.select('a[href^=/venues?accessTimeIds]')], \n",
" 'last_entry': body.select_one('label[for=Venue_LastEntryTime]').parent.parent.select_one('td:nth-of-type(2)').text.strip()\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": 125,
"metadata": {},
"outputs": [],
"source": [
"resp_to_detail(listing_details_dicts[0]['detail_content'])"
]
},
{
"cell_type": "code",
"execution_count": 126,
"metadata": {},
"outputs": [],
"source": [
"for x in listing_details_dicts:\n",
" d2 = resp_to_detail(x['detail_content'])\n",
" x.update(d2)\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 127,
"metadata": {},
"outputs": [],
"source": [
"listing_details_dicts"
]
},
{
"cell_type": "code",
"execution_count": 128,
"metadata": {},
"outputs": [],
"source": [
"for x in listing_details_dicts:\n",
" print(resp_to_detail(x['detail_content']))\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 129,
"metadata": {},
"outputs": [],
"source": [
"for x in listing_details_dicts:\n",
" print(resp_to_detail(x['detail_content']))\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 130,
"metadata": {},
"outputs": [],
"source": [
"resp_to_detail(listing_details_dicts[1]['detail_content'])"
]
},
{
"cell_type": "code",
"execution_count": 131,
"metadata": {},
"outputs": [],
"source": [
"for x in listing_details_dicts:\n",
" d2 = resp_to_detail(x['detail_content'])\n",
" x.update(d2)\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 132,
"metadata": {},
"outputs": [],
"source": [
"import simplekml"
]
},
{
"cell_type": "code",
"execution_count": 133,
"metadata": {},
"outputs": [],
"source": [
"import geopy"
]
},
{
"cell_type": "code",
"execution_count": 134,
"metadata": {},
"outputs": [],
"source": [
"def resp_to_detail(resp):\n",
" bs = BeautifulSoup(resp.content)\n",
" body = bs.select_one('#body')\n",
" return {'description': body.select_one('div[data-body]').text.strip(),\n",
" 'open_times': [', '.join([y.strip() for y in x.parent.text.strip().split('|')]) for x in body.select('a[href^=/venues?accessTimeIds]')], \n",
" #'last_entry': body.select_one('label[for=Venue_LastEntryTime]').parent.parent.select_one('td:nth-of-type(2)').text.strip()\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": 135,
"metadata": {},
"outputs": [],
"source": [
"resp_to_detail(listing_details_dicts[1]['detail_content'])"
]
},
{
"cell_type": "code",
"execution_count": 136,
"metadata": {},
"outputs": [],
"source": [
"for x in listing_details_dicts:\n",
" d2 = resp_to_detail(x['detail_content'])\n",
" x.update(d2)\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 137,
"metadata": {},
"outputs": [],
"source": [
"listing_details_dicts"
]
},
{
"cell_type": "code",
"execution_count": 138,
"metadata": {},
"outputs": [],
"source": [
"from geopy.geocoders import Nominatim"
]
},
{
"cell_type": "code",
"execution_count": 139,
"metadata": {},
"outputs": [],
"source": [
"geolocator = Nominatim()"
]
},
{
"cell_type": "code",
"execution_count": 140,
"metadata": {},
"outputs": [],
"source": [
"geolocator.geocode(listing_details_dicts[0]['address'])"
]
},
{
"cell_type": "code",
"execution_count": 141,
"metadata": {},
"outputs": [],
"source": [
"geolocator.geocode(listing_details_dicts[0]['address'])"
]
},
{
"cell_type": "code",
"execution_count": 142,
"metadata": {},
"outputs": [],
"source": [
"for x in listing_details_dicts:\n",
" x['loc'] = geolocator.geocode(x['address'])\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 143,
"metadata": {},
"outputs": [],
"source": [
"geolocator = Nominatim(timeout=5)"
]
},
{
"cell_type": "code",
"execution_count": 144,
"metadata": {},
"outputs": [],
"source": [
"for x in listing_details_dicts:\n",
" x['loc'] = geolocator.geocode(x['address'])\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 145,
"metadata": {},
"outputs": [],
"source": [
"listing_details_dicts"
]
},
{
"cell_type": "code",
"execution_count": 146,
"metadata": {},
"outputs": [],
"source": [
"listing_details_dicts[0]"
]
},
{
"cell_type": "code",
"execution_count": 147,
"metadata": {},
"outputs": [],
"source": [
"for x in listing_details_dicts:\n",
" if 'loc' not in x: x['loc'] = geolocator.geocode(x['address'])\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 148,
"metadata": {},
"outputs": [],
"source": [
"[x for x in listing_details_dicts if 'loc' not in x]"
]
},
{
"cell_type": "code",
"execution_count": 149,
"metadata": {},
"outputs": [],
"source": [
"[x for x in listing_details_dicts if 'loc' not in x]"
]
},
{
"cell_type": "code",
"execution_count": 150,
"metadata": {},
"outputs": [],
"source": [
"len([x for x in listing_details_dicts if 'loc' not in x])"
]
},
{
"cell_type": "code",
"execution_count": 151,
"metadata": {},
"outputs": [],
"source": [
"len([x for x in listing_details_dicts if 'loc' in x])"
]
},
{
"cell_type": "code",
"execution_count": 152,
"metadata": {},
"outputs": [],
"source": [
"for x in listing_details_dicts:\n",
" if 'loc' not in x: x['loc'] = geolocator.geocode(x['address'])\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 153,
"metadata": {},
"outputs": [],
"source": [
"for x in listing_details_dicts:\n",
" if 'loc' not in x: print(x['address']); x['loc'] = geolocator.geocode(x['address'])\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 154,
"metadata": {},
"outputs": [],
"source": [
"for x in listing_details_dicts:\n",
" if 'loc' not in x: print(x['address']); x['loc'] = geolocator.geocode(x['address'])\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 155,
"metadata": {},
"outputs": [],
"source": [
"for x in listing_details_dicts:\n",
" if 'loc' not in x: print(x['address']); try: x['loc'] = geolocator.geocode(x['address']); except: pass;\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 156,
"metadata": {},
"outputs": [],
"source": [
"for x in listing_details_dicts:\n",
" if 'loc' not in x: print(x['address'])\n",
" try:\n",
" x['loc'] = geolocator.geocode(x['address'])\n",
" except:\n",
" pass\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 157,
"metadata": {},
"outputs": [],
"source": [
"len([x for x in listing_details_dicts if 'loc' in x])"
]
},
{
"cell_type": "code",
"execution_count": 158,
"metadata": {},
"outputs": [],
"source": [
"listing_details_dicts[0]"
]
},
{
"cell_type": "code",
"execution_count": 159,
"metadata": {},
"outputs": [],
"source": [
"listing_details_dicts[0].loc"
]
},
{
"cell_type": "code",
"execution_count": 160,
"metadata": {},
"outputs": [],
"source": [
"listing_details_dicts[0]['loc']"
]
},
{
"cell_type": "code",
"execution_count": 161,
"metadata": {},
"outputs": [],
"source": [
"listing_details_dicts[0]['loc'].point"
]
},
{
"cell_type": "code",
"execution_count": 162,
"metadata": {},
"outputs": [],
"source": [
"dir(listing_details_dicts[0]['loc'].point)"
]
},
{
"cell_type": "code",
"execution_count": 163,
"metadata": {},
"outputs": [],
"source": [
"listing_details_dicts[0]['loc'].point"
]
},
{
"cell_type": "code",
"execution_count": 164,
"metadata": {},
"outputs": [],
"source": [
"listing_details_dicts[0]['loc'].point[0]"
]
},
{
"cell_type": "code",
"execution_count": 165,
"metadata": {},
"outputs": [],
"source": [
"listing_details_dicts[0]['loc'].point[1]"
]
},
{
"cell_type": "code",
"execution_count": 166,
"metadata": {},
"outputs": [],
"source": [
"kml = simplekml.Kml()"
]
},
{
"cell_type": "code",
"execution_count": 167,
"metadata": {},
"outputs": [],
"source": [
"kml.newpoint?"
]
},
{
"cell_type": "code",
"execution_count": 168,
"metadata": {},
"outputs": [],
"source": [
"kml.newpoint??"
]
},
{
"cell_type": "code",
"execution_count": 169,
"metadata": {},
"outputs": [],
"source": [
"kml.newpoint???"
]
},
{
"cell_type": "code",
"execution_count": 170,
"metadata": {},
"outputs": [],
"source": [
"kml.newpoint??"
]
},
{
"cell_type": "code",
"execution_count": 171,
"metadata": {},
"outputs": [],
"source": [
"def make_point(doc):\n",
" doc\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 172,
"metadata": {},
"outputs": [],
"source": [
"def make_point(doc, item):\n",
" doc.newpoint(name=item['name'])\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 173,
"metadata": {},
"outputs": [],
"source": [
"listing_details_dicts[0]"
]
},
{
"cell_type": "code",
"execution_count": 174,
"metadata": {},
"outputs": [],
"source": [
"def make_point(doc, item):\n",
" return doc.newpoint(name=item['name'], description=item['description'] + '\\n' + item['detail'] + '\\n'.join(item['open_times']) )"
]
},
{
"cell_type": "code",
"execution_count": 175,
"metadata": {},
"outputs": [],
"source": [
"def make_point(doc, item):\n",
" return doc.newpoint(name=item['name'], description=item['description'] + '\\n' + item['detail'] + '\\n'.join(item['open_times']), coords=item['loc'].point )"
]
},
{
"cell_type": "code",
"execution_count": 176,
"metadata": {},
"outputs": [],
"source": [
"make_point(kml, listing_details_dicts[0])"
]
},
{
"cell_type": "code",
"execution_count": 177,
"metadata": {},
"outputs": [],
"source": [
"len(listing_details_dicts[0]['loc'].point)"
]
},
{
"cell_type": "code",
"execution_count": 178,
"metadata": {},
"outputs": [],
"source": [
"def make_point(doc, item):\n",
" return doc.newpoint(name=item['name'], description=item['description'] + '\\n' + item['detail'] + '\\n'.join(item['open_times']), coords=item['loc'].point.long )"
]
},
{
"cell_type": "code",
"execution_count": 179,
"metadata": {},
"outputs": [],
"source": [
"p = listing_details_dicts[0]['loc'].point"
]
},
{
"cell_type": "code",
"execution_count": 180,
"metadata": {},
"outputs": [],
"source": [
"p.longitude"
]
},
{
"cell_type": "code",
"execution_count": 181,
"metadata": {},
"outputs": [],
"source": [
"p.latitude"
]
},
{
"cell_type": "code",
"execution_count": 182,
"metadata": {},
"outputs": [],
"source": [
"def make_point(doc, item):\n",
" return doc.newpoint(name=item['name'], description=item['description'] + '\\n' + item['detail'] + '\\n'.join(item['open_times']), coords=(item['loc'].point.longitude, item['loc'].point.latitude ,) )"
]
},
{
"cell_type": "code",
"execution_count": 183,
"metadata": {},
"outputs": [],
"source": [
"make_point(kml, listing_details_dicts[0])"
]
},
{
"cell_type": "code",
"execution_count": 184,
"metadata": {},
"outputs": [],
"source": [
"def make_point(doc, item):\n",
" return doc.newpoint(name=item['name'], description=item['description'] + '\\n' + item['detail'] + '\\n'.join(item['open_times']), coords=[(item['loc'].point.longitude, item['loc'].point.latitude ,)] )"
]
},
{
"cell_type": "code",
"execution_count": 185,
"metadata": {},
"outputs": [],
"source": [
"kml.kml()"
]
},
{
"cell_type": "code",
"execution_count": 186,
"metadata": {},
"outputs": [],
"source": [
"kml"
]
},
{
"cell_type": "code",
"execution_count": 187,
"metadata": {},
"outputs": [],
"source": [
"make_point(kml, listing_details_dicts[0])"
]
},
{
"cell_type": "code",
"execution_count": 188,
"metadata": {},
"outputs": [],
"source": [
"make_point(kml, listing_details_dicts[0])"
]
},
{
"cell_type": "code",
"execution_count": 189,
"metadata": {},
"outputs": [],
"source": [
"kml.kml()"
]
},
{
"cell_type": "code",
"execution_count": 190,
"metadata": {},
"outputs": [],
"source": [
"p"
]
},
{
"cell_type": "code",
"execution_count": 191,
"metadata": {},
"outputs": [],
"source": [
"p.longitude"
]
},
{
"cell_type": "code",
"execution_count": 192,
"metadata": {},
"outputs": [],
"source": [
"kml = None"
]
},
{
"cell_type": "code",
"execution_count": 193,
"metadata": {},
"outputs": [],
"source": [
"kml = simplekml.Kml()"
]
},
{
"cell_type": "code",
"execution_count": 194,
"metadata": {},
"outputs": [],
"source": [
"for x in listing_details_dicts:\n",
"    if 'loc' in x:\n",
"        make_point(kml, x)\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 195,
"metadata": {},
"outputs": [],
"source": [
"for x in listing_details_dicts:\n",
"    if 'loc' in x:\n",
"        make_point(kml, x)\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 196,
"metadata": {},
"outputs": [],
"source": [
"kml"
]
},
{
"cell_type": "code",
"execution_count": 197,
"metadata": {},
"outputs": [],
"source": [
"kml.kml()"
]
},
{
"cell_type": "code",
"execution_count": 198,
"metadata": {},
"outputs": [],
"source": [
"kml = None"
]
},
{
"cell_type": "code",
"execution_count": 199,
"metadata": {},
"outputs": [],
"source": [
"kml.kml()"
]
},
{
"cell_type": "code",
"execution_count": 200,
"metadata": {},
"outputs": [],
"source": [
"kml = simplekml.Kml()"
]
},
{
"cell_type": "code",
"execution_count": 201,
"metadata": {},
"outputs": [],
"source": [
"for x in listing_details_dicts:\n",
" if 'loc' in x:\n",
" try:\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 202,
"metadata": {},
"outputs": [],
"source": [
"for x in listing_details_dicts:\n",
"    if 'loc' in x:\n",
"        try:\n",
"            make_point(kml, x)\n",
"        except:\n",
"            pass\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 203,
"metadata": {},
"outputs": [],
"source": [
"kml.kml()"
]
},
{
"cell_type": "code",
"execution_count": 204,
"metadata": {},
"outputs": [],
"source": [
"kml.save('/tmp/openhouse2015.kml')"
]
},
{
"cell_type": "code",
"execution_count": 205,
"metadata": {},
"outputs": [],
"source": [
"def make_point(doc, item):\n",
" return doc.newpoint(name=item['name'], description=item['description'] + ' \\n ' + item['detail'] + ' \\n'.join(item['open_times']), coords=[(item['loc'].point.longitude, item['loc'].point.latitude ,)] )"
]
},
{
"cell_type": "code",
"execution_count": 206,
"metadata": {},
"outputs": [],
"source": [
"kml.kml()"
]
},
{
"cell_type": "code",
"execution_count": 207,
"metadata": {},
"outputs": [],
"source": [
"kml = simplekml.Kml()"
]
},
{
"cell_type": "code",
"execution_count": 208,
"metadata": {},
"outputs": [],
"source": [
"for x in listing_details_dicts:\n",
"    if 'loc' in x:\n",
"        try:\n",
"            make_point(kml, x)\n",
"        except:\n",
"            pass\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 209,
"metadata": {},
"outputs": [],
"source": [
"kml.save('/tmp/openhouse2015.kml')"
]
},
{
"cell_type": "code",
"execution_count": 210,
"metadata": {},
"outputs": [],
"source": [
"def make_point(doc, item):\n",
" return doc.newpoint(name=item['name'], description=item['description'] + ' \\n'.join(item['open_times'] + ' \\n ' + item['detail']), coords=[(item['loc'].point.longitude, item['loc'].point.latitude ,)] )"
]
},
{
"cell_type": "code",
"execution_count": 211,
"metadata": {},
"outputs": [],
"source": [
"kml = simplekml.Kml()"
]
},
{
"cell_type": "code",
"execution_count": 212,
"metadata": {},
"outputs": [],
"source": [
"for x in listing_details_dicts:\n",
"    if 'loc' in x:\n",
"        try:\n",
"            make_point(kml, x)\n",
"        except:\n",
"            pass\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 213,
"metadata": {},
"outputs": [],
"source": [
"kml.save('/tmp/openhouse2015.kml')"
]
},
{
"cell_type": "code",
"execution_count": 214,
"metadata": {},
"outputs": [],
"source": [
"def make_point(doc, item):\n",
" return doc.newpoint(name=item['name'], description=item['description'] + ' \\n'.join(item['open_times']) + ' \\n ' + item['detail'], coords=[(item['loc'].point.longitude, item['loc'].point.latitude ,)] )"
]
},
{
"cell_type": "code",
"execution_count": 215,
"metadata": {},
"outputs": [],
"source": [
"kml = simplekml.Kml()"
]
},
{
"cell_type": "code",
"execution_count": 216,
"metadata": {},
"outputs": [],
"source": [
"for x in listing_details_dicts:\n",
"    if 'loc' in x:\n",
"        try:\n",
"            make_point(kml, x)\n",
"        except:\n",
"            pass\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 217,
"metadata": {},
"outputs": [],
"source": [
"kml.save('/tmp/openhouse2015.kml')"
]
},
{
"cell_type": "code",
"execution_count": 218,
"metadata": {},
"outputs": [],
"source": [
"sunday_listings = []"
]
},
{
"cell_type": "code",
"execution_count": 219,
"metadata": {},
"outputs": [],
"source": [
"listing_details_dicts[0]"
]
},
{
"cell_type": "code",
"execution_count": 220,
"metadata": {},
"outputs": [],
"source": [
"'Sunday' in listing_details_dicts[0]['opening_times']"
]
},
{
"cell_type": "code",
"execution_count": 221,
"metadata": {},
"outputs": [],
"source": [
"'Sunday' in listing_details_dicts[0]['open_times']"
]
},
{
"cell_type": "code",
"execution_count": 222,
"metadata": {},
"outputs": [],
"source": [
"'Sunday' in ' '.join(listing_details_dicts[0]['open_times'])"
]
},
{
"cell_type": "code",
"execution_count": 223,
"metadata": {},
"outputs": [],
"source": [
"'Sunday' in ' '.join(listing_details_dicts[0]['open_times']).lower()"
]
},
{
"cell_type": "code",
"execution_count": 224,
"metadata": {},
"outputs": [],
"source": [
"'sunday' in ' '.join(listing_details_dicts[0]['open_times']).lower()"
]
},
{
"cell_type": "code",
"execution_count": 225,
"metadata": {},
"outputs": [],
"source": [
"sunday_listings"
]
},
{
"cell_type": "code",
"execution_count": 226,
"metadata": {},
"outputs": [],
"source": [
"for x in listing_details_dicts:\n",
" print('x')\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 227,
"metadata": {},
"outputs": [],
"source": [
"for x in listing_details_dicts:\n",
"    if 'sunday' in ' '.join(x['open_times']).lower():\n",
"        sunday_listings.append(x)\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 228,
"metadata": {},
"outputs": [],
"source": [
"kml = simplekml.Kml()"
]
},
{
"cell_type": "code",
"execution_count": 229,
"metadata": {},
"outputs": [],
"source": [
"for x in sunday_listings:\n",
"    if 'loc' in x:\n",
"        try:\n",
"            make_point(kml, x)\n",
"        except:\n",
"            pass\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 230,
"metadata": {},
"outputs": [],
"source": [
"kml.save('/tmp/open_sunday.kml')"
]
}
],
"metadata": {},
"nbformat": 4,
"nbformat_minor": 0
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment