Skip to content

Instantly share code, notes, and snippets.

@curran
Created December 17, 2022 15:50
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save curran/2d8d9da4c59190ebd564da9df8f24e9e to your computer and use it in GitHub Desktop.
Save curran/2d8d9da4c59190ebd564da9df8f24e9e to your computer and use it in GitHub Desktop.
Gist Scraping Script Prototype

A working proof of concept that can iterate over each file of each Gist for a given username.

import { randomUUID } from 'node:crypto';
// https://github.com/octokit/octokit.js
import { Octokit } from 'octokit';
// import users from 'users-combined.csv';
// Users whose gists will be processed. Hard-coded to a single entry for now;
// the commented-out import above suggests the list may later come from a
// CSV file — TODO confirm.
const users = [{ username: 'curran' }];
// Alternative client construction that supplies a personal access token:
//const octokit = new Octokit({ auth: `personal-access-token123` });
// GitHub API client, constructed without an auth token.
// NOTE(review): presumably this makes requests subject to GitHub's
// unauthenticated rate limits — confirm before running on many users.
const octokit = new Octokit();
/**
 * Generates a random identifier.
 *
 * The ID is a version-4 UUID with its dashes removed, i.e. a fixed-length
 * string of 32 lowercase hexadecimal characters. `Math.random()` is
 * deliberately not used: stringifying it yields variable-length output
 * (`0.5` becomes `"05"`), may contain `e-` for very small values, and has
 * too little entropy to rule out collisions across a large migration.
 *
 * TODO generate actual ID — confirm this format matches what the target
 * system expects.
 *
 * @returns {string} 32 lowercase hexadecimal characters.
 */
const generateId = () => randomUUID().replaceAll('-', '');
// TODO generate actual file ID.
// Until then, file IDs are produced exactly like any other ID by
// delegating to generateId.
const generateFileId = () => generateId();
/**
 * Placeholder mapping from a GitHub username to a VizHub user ID.
 *
 * TODO actually get the right user. For now the result is the username
 * followed by a newly generated ID, so it is not a real lookup.
 * NOTE(review): because a new ID is generated on every call, the same
 * username presumably maps to different values across calls — confirm
 * this is acceptable for the prototype.
 *
 * @param {string} username - GitHub login name.
 * @returns {string} The username with a generated ID appended.
 */
const getVizHubUserIdFromGitHubUsername = (username) => {
  const generatedSuffix = generateId();
  return username + generatedSuffix;
};
/**
 * Converts a gist date string into a Unix timestamp in whole seconds.
 *
 * Sample input: "2022-06-05T13:33:26Z"
 * Sample output: 1654436006
 *
 * @param {string} gistDate - ISO 8601 date string, such as a gist's
 *   `created_at` or `updated_at` value.
 * @returns {number} Integer number of seconds since the Unix epoch.
 * @throws {RangeError} If `gistDate` cannot be parsed as a date. Failing
 *   loudly avoids writing NaN (serialized as null in JSON) into a viz.
 */
const gistDateToTimestamp = (gistDate) => {
  const milliseconds = Date.parse(gistDate);
  if (Number.isNaN(milliseconds)) {
    throw new RangeError(`Invalid gist date: ${String(gistDate)}`);
  }
  // Date.parse yields milliseconds; the target format is integer seconds.
  return Math.floor(milliseconds / 1000);
};
// Pause inserted after each file download and after each gist, in
// milliseconds, to space out requests.
const REQUEST_DELAY_MS = 1000;

// Resolves after `ms` milliseconds.
const sleep = (ms) =>
  new Promise((resolve) => setTimeout(resolve, ms));

/**
 * Downloads the text of every file in a gist, one request at a time.
 *
 * @param {object} gist - A gist as returned by the GitHub API; only
 *   `gist.files[*].filename` and `gist.files[*].raw_url` are read.
 * @returns {Promise<object>} Map from generated file ID to
 *   `{ name, text }`.
 * @throws {Error} If any file download responds with a non-2xx status, so
 *   an error page is never stored as file content.
 */
const fetchGistFiles = async (gist) => {
  const files = {};
  for (const { filename, raw_url } of Object.values(gist.files)) {
    const response = await fetch(raw_url);
    if (!response.ok) {
      throw new Error(
        `Failed to fetch ${raw_url}: ${response.status} ${response.statusText}`
      );
    }
    const text = await response.text();
    files[generateFileId()] = { name: filename, text };
    await sleep(REQUEST_DELAY_MS);
  }
  return files;
};

/**
 * Builds the viz object for a gist.
 *
 * Abbreviated example of the gist fields that are read:
 *
 *   {
 *     "files": {
 *       "index.html": {
 *         "filename": "index.html",
 *         "raw_url": "https://gist.githubusercontent.com/curran/a6c261aca1a12452111cb1b797c04d70/raw/23e22fc3d4f471c026b97db75840e3836b93dd65/index.html"
 *       }
 *     },
 *     "created_at": "2022-06-05T13:33:26Z",
 *     "updated_at": "2022-06-05T13:36:47Z",
 *     "description": " React & D3 Starter",
 *     "owner": { "login": "curran" }
 *   }
 *
 * @param {object} gist - A gist as returned by the GitHub API.
 * @param {object} files - Map from file ID to `{ name, text }`.
 * @returns {object} `{ id, vizInfo, vizContent }` sharing one generated ID.
 */
const gistToViz = (gist, files) => {
  // TODO generate actual ID
  const id = generateId();
  const owner = getVizHubUserIdFromGitHubUsername(gist.owner.login);
  return {
    id,
    vizInfo: {
      id,
      owner,
      authors: [owner],
      // A gist without a description has `description: null`; fall back to
      // an empty title instead of throwing on `.trim()`.
      title: (gist.description ?? '').trim(),
      // Backfilled separately
      forkedFrom: undefined,
      createdTimestamp: gistDateToTimestamp(gist.created_at),
      lastUpdatedTimestamp: gistDateToTimestamp(gist.updated_at),
    },
    vizContent: { id, files },
  };
};

/**
 * Iterates over every gist of every user in `users`, downloads each gist's
 * files and logs the viz object that would be created for it.
 *
 * All result pages are requested via `octokit.paginate`; a single request
 * returns only the first page, which would silently skip the remaining
 * gists of prolific users. Only `username` is sent as a request parameter
 * so that extra fields on a user record never leak into the API call.
 *
 * @returns {Promise<void>} Resolves once every gist has been processed.
 * @throws {Error} If a GitHub API request or a file download fails.
 */
const migrate = async () => {
  for (const { username } of users) {
    const gists = await octokit.paginate(
      'GET /users/{username}/gists',
      { username, per_page: 100 }
    );
    for (const gist of gists) {
      // Fetch the content of each file individually.
      const files = await fetchGistFiles(gist);
      const viz = gistToViz(gist, files);
      console.log('Create this viz:');
      console.log(JSON.stringify(viz, null, 2));
      await sleep(REQUEST_DELAY_MS);
    }
  }
};
// Run the migration. The returned promise is handled explicitly so that a
// failed request is reported rather than left as a floating rejection.
migrate().catch((error) => {
  console.error(error);
  // Signal failure to the calling shell when running under Node.
  if (typeof process !== 'undefined') {
    process.exitCode = 1;
  }
});
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment