Last active
January 4, 2018 19:11
-
-
Save danemacaulay/0821867c252380f918e62f67aa9fd620 to your computer and use it in GitHub Desktop.
Stream through remote common crawl index file to search for WARC entries by URL
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys | |
import requests | |
import zlib | |
import json | |
from urllib.parse import urlparse | |
from collections import Counter | |
# We are searching the common-crawl index for Google Maps "place" pages.
GOOGLE_NETLOC = 'www.google.com'
GOOGLE_PATH = '/maps/place'


def iter_index_lines(response, chunk_size=1024):
    """Yield complete decoded text lines from a gzip-compressed HTTP stream.

    Decompresses incrementally so the (multi-GB) index shard is never held
    in memory, and correctly re-assembles lines that straddle chunk
    boundaries. Fixes three bugs in the original loop: a chunk ending
    exactly on '\\n' glued two complete lines together; ``list.remove``
    dropped the first *equal* line rather than the last element; and the
    final buffered line was never emitted at end of stream.

    response   -- a requests Response opened with stream=True
    chunk_size -- bytes to pull per iteration of the raw stream
    """
    # +32 on wbits makes zlib auto-detect and skip the gzip header.
    dec = zlib.decompressobj(32 + zlib.MAX_WBITS)
    pending = b''
    for chunk in response.iter_content(chunk_size=chunk_size):
        if not chunk:
            continue
        pending += dec.decompress(chunk)
        pieces = pending.split(b'\n')
        # The final piece is either empty (chunk ended on a newline) or a
        # partial line; either way carry it over to the next chunk.
        pending = pieces.pop()
        for raw in pieces:
            yield raw.decode('utf-8', errors='replace')
    # Emit whatever remains once the stream is exhausted (the original
    # version silently dropped this tail).
    tail = pending + dec.flush()
    if tail:
        yield tail.decode('utf-8', errors='replace')


def extract_json(line):
    """Return the JSON payload of a CDX index line ('urlkey ts {json}').

    Returns None when the line carries no JSON object. Unlike the original
    ``str(line).split("{")[1].replace("'", '')`` hack, this never strips
    apostrophes from the payload and tolerates '{' inside the JSON itself.
    """
    _, brace, payload = line.partition('{')
    return brace + payload if brace else None


def is_maps_record(record):
    """True when the parsed CDX record's URL is a Google Maps place page."""
    parsed = urlparse(record.get('url', ''))
    return parsed.netloc == GOOGLE_NETLOC and GOOGLE_PATH in parsed.path


def main(path):
    """Stream one common-crawl index shard and print matching CDX records.

    path -- shard key under the commoncrawl S3 bucket (from cc-index.paths)
    """
    url = 'https://commoncrawl.s3.amazonaws.com/{}'.format(path)
    with requests.get(url, stream=True) as r:
        r.raise_for_status()
        for line in iter_index_lines(r):
            data = extract_json(line)
            if data is None:
                continue
            try:
                record = json.loads(data)
            except ValueError:  # malformed / truncated JSON: skip the line
                continue
            if is_maps_record(record):
                print(data)


if __name__ == '__main__':
    main(sys.argv[1])
Sign up for free to join this conversation on GitHub.
Already have an account?
Sign in to comment
# Fan out: run indexer.py once for each shard path listed in cc-index.paths,
# 8 jobs at a time, collecting all matches into results.txt.
# NOTE(review): each job appends to results.txt itself; with 8 concurrent
# appenders, interleaved partial lines are possible. Consider moving the
# redirection outside the quotes (`parallel -j8 "python3 indexer.py {}" >>
# results.txt`) so parallel buffers each job's output — TODO confirm.
cat cc-index.paths | parallel -j8 " python3 indexer.py {} >> results.txt"