Skip to content

Instantly share code, notes, and snippets.

@danemacaulay
Last active January 4, 2018 19:11
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save danemacaulay/0821867c252380f918e62f67aa9fd620 to your computer and use it in GitHub Desktop.
Save danemacaulay/0821867c252380f918e62f67aa9fd620 to your computer and use it in GitHub Desktop.
Stream through a remote Common Crawl index file to search for WARC entries by URL.
import sys
import requests
import zlib
import json
from urllib.parse import urlparse
from collections import Counter
google_netloc = 'www.google.com'
google_path = '/maps/place'


def iter_decompressed_lines(chunks):
    """Yield complete lines (as bytes, without the newline) from a gzip stream.

    ``chunks`` is any iterable of compressed byte strings, e.g. the result of
    ``Response.iter_content``. Chunk boundaries may fall anywhere, including
    in the middle of a line or of a gzip header.

    The stream may consist of several concatenated gzip members; each one is
    decompressed in turn.
    NOTE(review): Common Crawl index shards are believed to be written as many
    small gzip members back to back -- confirm against the index format docs.
    """
    # 32 + MAX_WBITS lets zlib auto-detect a gzip or zlib header.
    dec = zlib.decompressobj(32 + zlib.MAX_WBITS)
    pending = b''  # bytes of the line that has not been terminated yet
    for chunk in chunks:
        while chunk:
            pending += dec.decompress(chunk)
            if dec.eof:
                # End of one gzip member: whatever follows belongs to the
                # next member and needs a fresh decompressor.
                chunk = dec.unused_data
                dec = zlib.decompressobj(32 + zlib.MAX_WBITS)
            else:
                chunk = b''
        # Everything before the last b'\n' is complete; the tail (possibly
        # empty) is carried over to the next chunk.
        *complete, pending = pending.split(b'\n')
        yield from complete
    pending += dec.flush()
    if pending:
        # Final line of a stream that does not end with a newline.
        yield pending


def extract_maps_record(line):
    """Return the JSON text of ``line`` if it describes a Google Maps place.

    ``line`` is one index line (bytes or str) whose JSON object starts at the
    first ``{``. The JSON text is returned when its ``url`` field has host
    ``google_netloc`` and a path containing ``google_path``. Otherwise, or
    when the line has no parseable JSON object with a string ``url``, ``None``
    is returned.
    """
    if isinstance(line, bytes):
        # Decode instead of using str(bytes), which would yield the b'...'
        # repr and corrupt apostrophes and non-ASCII characters.
        line = line.decode('utf-8', errors='replace')
    _, brace, rest = line.partition('{')
    if not brace:
        return None
    data = (brace + rest).strip()
    try:
        parsed = json.loads(data)
        target = parsed.get('url') if isinstance(parsed, dict) else None
        if not isinstance(target, str):
            return None
        # urlparse can itself raise ValueError (e.g. malformed IPv6 host).
        parsed_uri = urlparse(target)
    except ValueError:
        return None
    is_maps_path = google_path in parsed_uri.path
    is_maps_domain = parsed_uri.netloc == google_netloc
    return data if is_maps_path and is_maps_domain else None


def main(argv=None):
    """Stream one remote index file and print matching records.

    ``argv`` defaults to ``sys.argv[1:]``; its first element is the object
    path appended to the bucket URL. Returns a process exit status.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    if not args:
        print('usage: indexer.py <index-path>', file=sys.stderr)
        return 2
    path = args[0]
    url = 'https://commoncrawl.s3.amazonaws.com/{}'.format(path)
    with requests.get(url, stream=True) as r:
        # Fail loudly on 4xx/5xx instead of feeding an error page to zlib.
        r.raise_for_status()
        chunks = r.iter_content(chunk_size=64 * 1024)
        for line in iter_decompressed_lines(chunks):
            data = extract_maps_record(line)
            if data is not None:
                print(data)
    return 0


if __name__ == '__main__':
    sys.exit(main())
@danemacaulay
Copy link
Author

danemacaulay commented Jan 4, 2018

cat cc-index.paths | parallel -j8 " python3 indexer.py {} >> results.txt"

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment