@danemacaulay
Created January 4, 2018 15:19
Direct to STDOUT all WARC data for a particular domain, using index.commoncrawl.org
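
Each result line returned by the index is a JSON record; the script below only needs its filename, offset, and length fields, which locate one gzipped WARC record inside a crawl archive. A minimal illustration of how those fields become an HTTP Range request (the values here are placeholders, not real index data):

record = {
    'filename': 'crawl-data/CC-MAIN-2017-51/segments/.../warc/example.warc.gz',  # placeholder path
    'offset': '1234567',   # byte offset of the record within the archive
    'length': '8901',      # compressed length of the record in bytes
}
start = int(record['offset'])
end = start + int(record['length']) - 1        # Range header is inclusive on both ends
byte_range = 'bytes={}-{}'.format(start, end)  # -> 'bytes=1234567-1243467'
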
import gzip
import json
import requests
from StringIO import StringIO

def get_page_count(searchString):
    # Ask the index how many pages of results exist for this URL pattern.
    url = 'http://index.commoncrawl.org/CC-MAIN-2017-51-index?url={}&output=json&showNumPages=true'.format(searchString)
    resp = requests.get(url)
    return json.loads(resp.content)['pages']

def search(searchString, page_number):
    # Fetch one page of index results; each line is a JSON record.
    url = 'http://index.commoncrawl.org/CC-MAIN-2017-51-index?url={}&output=json&page={}'.format(searchString, page_number)
    resp = requests.get(url)
    pages = [json.loads(x) for x in resp.content.strip().split('\n')]
    return pages

def get_warc(page):
    # Pull only this record's bytes from the crawl archive via an HTTP Range
    # request, gunzip them, and write the WARC data to STDOUT.
    offset, length = int(page['offset']), int(page['length'])
    offset_end = offset + length - 1
    prefix = 'https://commoncrawl.s3.amazonaws.com/'
    url = prefix + page['filename']
    resp = requests.get(url, headers={'Range': 'bytes={}-{}'.format(offset, offset_end)})
    raw_data = StringIO(resp.content)
    f = gzip.GzipFile(fileobj=raw_data)
    data = f.read()
    print data

def get_all_warcs(searchString):
    # Walk every index page for the pattern and emit each matching record.
    page_count = get_page_count(searchString)
    for page_number in xrange(0, page_count):
        page_data_list = search(searchString, page_number)
        for page_data in page_data_list:
            get_warc(page_data)

get_all_warcs('facebook.com/*')
@danemacaulay (Author)

python2 warc_fetcher.py > facebook.warc
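
The script targets Python 2 (StringIO, print, xrange). If you need to run the same byte-range fetch under Python 3, a minimal sketch could look like this (fetch_record is a name introduced here for illustration; it assumes the same filename/offset/length index fields as above):

import gzip
import io
import requests

def fetch_record(record):
    # Pull only this record's bytes from the crawl archive.
    offset, length = int(record['offset']), int(record['length'])
    byte_range = 'bytes={}-{}'.format(offset, offset + length - 1)
    url = 'https://commoncrawl.s3.amazonaws.com/' + record['filename']
    resp = requests.get(url, headers={'Range': byte_range})
    # Each index entry points at an independent gzip member, so it
    # decompresses on its own.
    return gzip.GzipFile(fileobj=io.BytesIO(resp.content)).read()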
