Skip to content

Instantly share code, notes, and snippets.

@dfm
Last active May 30, 2021 18:28
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save dfm/57cb38d2805f0ce9de9467f3620b9df4 to your computer and use it in GitHub Desktop.
Save dfm/57cb38d2805f0ce9de9467f3620b9df4 to your computer and use it in GitHub Desktop.
A script to download a mirror copy of all of the generated docs for a ReadTheDocs project. httrack must be installed for this to work and you'll need to provide an API token for RTDs.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os
import re
import time
import json
import argparse
import requests
import tempfile
from pathlib import Path
from shutil import copytree
from subprocess import check_call
# Marker comments that delimit the "RTD Extra Head" section which is stripped
# from every mirrored HTML page (see remove_rtds_extrahead).
start = "<!-- RTD Extra Head -->"
end = "<!-- end RTD <extrahead> -->"
# The markers are escaped so they are always matched literally, and the match
# is non-greedy so that a page containing more than one marker pair only loses
# the marked sections, not everything between the first start marker and the
# last end marker. re.S lets "." cross line breaks; re.M is kept from the
# original flags although the pattern has no "^"/"$" anchors.
RTDS_EXTRA = re.compile(
    f"{re.escape(start)}.*?{re.escape(end)}", re.S | re.M
)
def get_versions(url, token=None):
    """Return the "results" entries of every page of a versions listing.

    The listing is requested with ``active=True``, ``built=True`` and
    ``limit=100``; as long as a response carries a non-empty "next" link, that
    link is requested as well and its results are appended.

    Args:
        url: URL of the first page of the versions endpoint.
        token: RTDs API token. When ``None``, the value of the
            'RTDS_API_TOKEN' environment variable is used instead.

    Returns:
        A list with the "results" items of all pages, in page order.

    Raises:
        RuntimeError: If no token was passed and 'RTDS_API_TOKEN' is unset or
            empty, or if an empty token was passed.
        requests.HTTPError: If a response has an error status
            (via ``raise_for_status``).
    """
    if token is None:
        # .get() instead of [...] so that a missing variable reaches the
        # explanatory RuntimeError below rather than raising a bare KeyError.
        token = os.environ.get("RTDS_API_TOKEN")
    if not token:
        raise RuntimeError(
            "A RTDs API token must be provided using the 'RTDS_API_TOKEN' "
            "environment variable or the '--token' command line argument"
        )

    params = dict(active=True, built=True, limit=100)
    headers = {"Authorization": f"Token {token}"}

    # Iterate instead of recursing: the same headers (and therefore the same
    # token) are reused for every page. The recursive form dropped the
    # explicitly passed token on the second page.
    results = []
    while True:
        r = requests.get(url, params=params, headers=headers)
        r.raise_for_status()
        data = r.json()
        results.extend(data.get("results", []))
        link = data.get("next", None)
        if not link:
            return results
        url = link
def remove_rtds_extrahead(filename):
    """Strip every match of ``RTDS_EXTRA`` from a file, rewriting it in place.

    Args:
        filename: Path of the HTML file to clean.
    """
    # The encoding is given explicitly so the result does not depend on the
    # platform's locale default.
    # NOTE(review): assumes the mirrored pages are UTF-8 encoded - confirm.
    with open(filename, "r", encoding="utf-8") as f:
        txt = RTDS_EXTRA.sub("", f.read())
    with open(filename, "w", encoding="utf-8") as f:
        f.write(txt)
def main():
    """Mirror the docs of every listed version of a RTDs project.

    Parses the command line, fetches the version list through
    ``get_versions``, and for each version whose ``mirror/<slug>`` directory
    does not exist yet: downloads it with ``httrack`` into a temporary
    directory, copies the downloaded tree to ``mirror/<slug>``, strips the RTD
    extra-head section from every ``*.html`` file and sleeps for ``--sleep``
    seconds. Finally writes the collected version index to
    ``mirror/unladen.json``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("project", help="The name of the RTDs project")
    parser.add_argument("--token", help="Your token for the RTDs API")
    parser.add_argument(
        "--sleep", default=20, type=int,
        help="The time (in seconds) to wait between versions"
    )
    args = parser.parse_args()

    target = Path("mirror")
    versions = get_versions(
        f"https://readthedocs.org/api/v3/projects/{args.project}/versions/",
        token=args.token,
    )

    database = {"versions": [], "aliases": {}}
    for version in versions:
        slug = version["slug"]
        path = target / slug

        # Every slug except "latest" and "stable" is recorded in the index,
        # whether or not it has to be downloaded in this run.
        if slug not in ("latest", "stable"):
            database["versions"].append(
                {
                    "ref": f"refs/tags/{slug}",
                    "version": slug,
                    "name": slug,
                    "path": slug,
                    "active": True,
                }
            )

        if path.exists():
            print(f"Skipping {slug}")
            continue

        url = version["urls"]["documentation"]
        with tempfile.TemporaryDirectory() as tmpdir:
            # Arguments are passed as a list without a shell, so each element
            # reaches httrack verbatim; no quoting characters are added.
            check_call(["httrack", url, "-O", tmpdir, "-%v"])
            # The downloaded tree is expected below the temporary directory
            # under the URL with its scheme removed (host + path).
            copytree(Path(tmpdir) / url.split("//")[1], path)

        print("Removing RTDs extra headers")
        for filename in path.rglob("*.html"):
            print(f"Cleaning {filename}")
            remove_rtds_extrahead(filename)

        print(f"Mirrored {slug}, waiting...")
        time.sleep(args.sleep)

    # Make sure the output directory exists even when no version was copied
    # in this run (for example, an empty version list).
    target.mkdir(parents=True, exist_ok=True)
    with open(target / "unladen.json", "w") as f:
        json.dump(database, f, indent=2)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment