Skip to content

Instantly share code, notes, and snippets.

@weiglemc
Created July 4, 2023 14:03
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save weiglemc/1fb86319177f98eb91e920155da720b5 to your computer and use it in GitHub Desktop.
Save weiglemc/1fb86319177f98eb91e920155da720b5 to your computer and use it in GitHub Desktop.
Python script to grab data from the Internet Archive via the CDX API server; it uses a function from Sawood Alam's CDXSummary tool.
# grab-cdx.py
from requests import Session
from rich.console import Console
from urllib.parse import urlencode
# Original resource URI whose captures are requested.
URIR = "https://www.cnn.com/"
# Bounds sent as the "from" and "to" query parameters.
FROM = "20150424"
TO = "20220923"
# only one entry per day, 200 OK
OTHER_PARAMS = f"&from={FROM}&to={TO}&collapse=timestamp:8&filter=statuscode:200"
# One shared HTTP session used for every CDX API request in this script.
REQSESSION = Session()
# Print helper bound to a console that writes red, non-highlighted text to stderr.
errprint = Console(
    highlight=False,
    style="red",
    stderr=True,
).print
# HELPFUL FUNCTION FROM CDXSUMMARY
# https://github.com/internetarchive/cdx-summary/blob/main/cdxsummary/__main__.py
def get_stream_from_api(url):
    """Yield raw response lines from every page of a paginated API query.

    The page count is fetched first by appending ``showNumPages=true`` to
    *url*; each page is then requested in turn with ``page=<n>`` and its
    body is streamed line by line. Progress and failed pages are reported
    on stderr through ``errprint``.

    Args:
        url: Query URL that already carries a query string; additional
            parameters are appended with ``&``.

    Yields:
        One line of a page's response body at a time, as read from the raw
        response stream (callers decode it).

    Raises:
        ValueError: If the page-count response body is not an integer.
    """
    pages = int(REQSESSION.get(f"{url}&showNumPages=true").text)
    for page in range(pages):
        pageurl = f"{url}&page={page}"
        errprint(f"Downloading [[cyan]{page + 1}/{pages}[/cyan]]: [magenta]{pageurl}[/magenta]")
        # The context manager releases the connection when the page is
        # exhausted, fails, or the consumer stops iterating early.
        with REQSESSION.get(pageurl, stream=True) as r:
            if not r.ok:
                # Keep the best-effort behaviour (skip the page) but make the
                # gap in the output visible instead of silently dropping it.
                errprint(f"Skipping page {page + 1}/{pages}: HTTP {r.status_code} for {pageurl}")
                continue
            # Ask the raw stream to undo any Content-Encoding (e.g. gzip).
            r.raw.decode_content = True
            yield from r.raw
def write_cdx(urir, cdxapi, params, outfile):
    """Download the CDX records for *urir* and write them to *outfile*.

    Args:
        urir: URI to look up; it is URL-encoded and sent as the ``url``
            query parameter.
        cdxapi: Base URL of the CDX API endpoint (without a query string).
        params: Pre-built query string placed before the ``url`` parameter.
        outfile: Path of the text file to create or overwrite.

    Raises:
        OSError: If *outfile* cannot be opened or written.
    """
    url = f"{cdxapi}?{params}&{urlencode({'url': urir})}"
    input_stream = get_stream_from_api(url)
    try:
        # Lines are decoded as UTF-8 (the bytes.decode() default), so write
        # them back as UTF-8 rather than the platform's default encoding.
        with open(outfile, "w", encoding="utf-8") as f:
            f.writelines(line.decode() for line in input_stream)
    finally:
        # Always finalize the generator so any open response is released,
        # including when writing fails part-way through.
        input_stream.close()
# MAIN
# Fetch the exact-match captures of URIR from the Internet Archive CDX
# endpoint and store them in a file named after the requested date range.
cdxapi = "https://web.archive.org/cdx/search"
params = f"matchType=exact{OTHER_PARAMS}"
outfile = f"cnn-{FROM}-{TO}-day.cdx"
write_cdx(urir=URIR, cdxapi=cdxapi, params=params, outfile=outfile)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment