Last active
January 4, 2018 19:11
-
-
Save danemacaulay/0821867c252380f918e62f67aa9fd620 to your computer and use it in GitHub Desktop.
Stream through remote common crawl index file to search for WARC entries by URL
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys | |
import requests | |
import zlib | |
import json | |
from urllib.parse import urlparse | |
from collections import Counter | |
# We are searching the common-crawl index for Google Maps "place" pages.
GOOGLE_NETLOC = 'www.google.com'
GOOGLE_PATH = '/maps/place'


def iter_index_lines(response, chunk_size=1024):
    """Yield complete decoded text lines from a gzip-compressed HTTP stream.

    Decompresses incrementally so the (multi-GB) index shard is never held
    in memory, and correctly re-assembles lines that straddle chunk
    boundaries. Fixes three bugs in the original loop: a chunk ending
    exactly on '\\n' glued two complete lines together; ``list.remove``
    dropped the first *equal* line rather than the last element; and the
    final buffered line was never emitted at end of stream.

    response   -- a requests Response opened with stream=True
    chunk_size -- bytes to pull per iteration of the raw stream
    """
    # +32 on wbits makes zlib auto-detect and skip the gzip header.
    dec = zlib.decompressobj(32 + zlib.MAX_WBITS)
    pending = b''
    for chunk in response.iter_content(chunk_size=chunk_size):
        if not chunk:
            continue
        pending += dec.decompress(chunk)
        pieces = pending.split(b'\n')
        # The final piece is either empty (chunk ended on a newline) or a
        # partial line; either way carry it over to the next chunk.
        pending = pieces.pop()
        for raw in pieces:
            yield raw.decode('utf-8', errors='replace')
    # Emit whatever remains once the stream is exhausted (the original
    # version silently dropped this tail).
    tail = pending + dec.flush()
    if tail:
        yield tail.decode('utf-8', errors='replace')


def extract_json(line):
    """Return the JSON payload of a CDX index line ('urlkey ts {json}').

    Returns None when the line carries no JSON object. Unlike the original
    ``str(line).split("{")[1].replace("'", '')`` hack, this never strips
    apostrophes from the payload and tolerates '{' inside the JSON itself.
    """
    _, brace, payload = line.partition('{')
    return brace + payload if brace else None


def is_maps_record(record):
    """True when the parsed CDX record's URL is a Google Maps place page."""
    parsed = urlparse(record.get('url', ''))
    return parsed.netloc == GOOGLE_NETLOC and GOOGLE_PATH in parsed.path


def main(path):
    """Stream one common-crawl index shard and print matching CDX records.

    path -- shard key under the commoncrawl S3 bucket (from cc-index.paths)
    """
    url = 'https://commoncrawl.s3.amazonaws.com/{}'.format(path)
    with requests.get(url, stream=True) as r:
        r.raise_for_status()
        for line in iter_index_lines(r):
            data = extract_json(line)
            if data is None:
                continue
            try:
                record = json.loads(data)
            except ValueError:  # malformed / truncated JSON: skip the line
                continue
            if is_maps_record(record):
                print(data)


if __name__ == '__main__':
    main(sys.argv[1])
Sign up for free to join this conversation on GitHub.
Already have an account?
Sign in to comment
# Fan out: run indexer.py once for each shard path listed in cc-index.paths,
# 8 jobs at a time, collecting all matches into results.txt.
# NOTE(review): each job appends to results.txt itself; with 8 concurrent
# appenders, interleaved partial lines are possible. Consider moving the
# redirection outside the quotes (`parallel -j8 "python3 indexer.py {}" >>
# results.txt`) so parallel buffers each job's output — TODO confirm.
cat cc-index.paths | parallel -j8 " python3 indexer.py {} >> results.txt"