Skip to content

Instantly share code, notes, and snippets.

@cboulanger
Last active May 28, 2021 13:54
Show Gist options
  • Save cboulanger/6be3e5aedb198d4a50e9320e373f02f5 to your computer and use it in GitHub Desktop.
Save cboulanger/6be3e5aedb198d4a50e9320e373f02f5 to your computer and use it in GitHub Desktop.
import {default as fetch} from 'node-fetch';
const { pdf } = require("pdf-to-img");
import {tmpdir} from "os";
import {createWriteStream, createReadStream} from 'fs';
import * as fsp from 'fs/promises'
import * as archiver from 'archiver';
import {ArchiverError} from "archiver";
import * as path from "path";
import {Parser, Builder} from "xml2js";
/**
 * Shape of the JSON body that the login request is parsed into.
 * Only `auth` is read by the code in this file; it is stored as the
 * session token used for subsequent requests.
 */
interface LoginResponse {
  /** Authentication token. */
  auth: string;
  /** Expiry of the token (NOTE(review): unit/epoch not visible here — confirm). */
  expires: number;
  /** Account the token was issued for. */
  user: {
    id: number;
    name: string;
    email: string;
    institute: string;
    admin: boolean;
  };
}
/**
 * Optional descriptive fields for a book. Every field may be omitted.
 * NOTE(review): the exact meaning of each field is defined by the server
 * API, not by this file — confirm against the API documentation.
 */
export type BookMetadata = {
  author?: string;
  title?: string;
  description?: string;
  language?: string;
  profilerUrl?: string;
  histPatterns?: string[];
  year?: number;
  pooled?: boolean;
};
/**
 * Small client for a PocoWeb REST endpoint. It can log in, build a zip
 * archive from a PDF plus its ABBYY FineReader XML, and upload that archive.
 */
export class PocowebApi {
  private readonly endpoint: string;
  private token?: string;

  /**
   * @param endpoint The API endpoint, without the /rest postfix
   */
  constructor(endpoint: string) {
    this.endpoint = endpoint;
  }

  /**
   * Given a service path of the REST API (without the "rest/" prefix), return
   * the final URL containing the authentication token.
   *
   * The `params` object is not modified. Entries whose value is `undefined`
   * or `null` are left out of the query string; every other value is
   * converted with `String()`.
   *
   * @param path Service path below `<endpoint>/rest/`
   * @param params Additional query parameters
   * @returns The absolute URL including the `auth` query parameter
   * @throws Error if {@link authenticate} has not stored a token yet
   * @protected
   */
  protected serviceUrl(path: string, params: {[key: string]: any} = {}): string {
    if (!this.token) {
      throw new Error(`Client is not authenticated yet`);
    }
    // Build the query from a fresh object so the caller's `params` never
    // receives the auth token as a side effect.
    const query = new URLSearchParams();
    for (const [key, value] of Object.entries(params)) {
      if (value !== undefined && value !== null) {
        query.set(key, String(value));
      }
    }
    query.set("auth", this.token);
    const url = new URL(`${this.endpoint}/rest/${path}`);
    url.search = query.toString();
    return url.toString();
  }

  /**
   * Authenticates with the PocoWeb server and stores the returned token
   * for later calls.
   *
   * @param email
   * @param password
   * @throws Error "Could not authenticate" if the response carries no token;
   *   network and JSON parsing errors propagate unchanged.
   */
  public async authenticate(email: string, password: string): Promise<void> {
    const response = await fetch(`${this.endpoint}/rest/login`, {
      body: JSON.stringify({email, password}),
      method: "POST"
    });
    const loginResponse = await response.json() as LoginResponse;
    if (!loginResponse || !loginResponse.auth) {
      throw new Error("Could not authenticate");
    }
    this.token = loginResponse.auth;
  }

  /**
   * Creates a PocoWeb-compliant project archive that can be uploaded
   * from a source PDF and its corresponding ABBYY FineReader XML document.
   *
   * For every PDF page a `page-NNN.png` and a `page-NNN.xml` entry is added;
   * the XML entry is the parsed document with its `page` member replaced by
   * the page at the same index.
   *
   * NOTE: the intermediate PNG files are written to the OS temp directory
   * under fixed names and are not removed afterwards.
   *
   * @param pdfPath Path to the source PDF
   * @param xmlPath Path to the XML document describing the PDF's pages
   * @returns Path to the created zip document (inside the OS temp directory)
   */
  public createArchive(pdfPath: string, xmlPath: string): Promise<string> {
    const outputDir = tmpdir();
    // Strip only a trailing ".pdf" (any case) and always append ".zip", so the
    // archive can never end up with the same file name as the input.
    const zipName = `${path.basename(pdfPath).replace(/\.pdf$/i, "")}.zip`;
    const zipPath = path.join(outputDir, zipName);

    return new Promise<string>((resolve, reject) => {
      const archive = archiver('zip', {
        zlib: { level: 9 }
      });
      const writeStream = createWriteStream(zipPath);
      let failed = false;
      const fail = (err: unknown): void => {
        failed = true;
        reject(err);
      };

      archive.pipe(writeStream);
      writeStream.on('close', () => {
        // After a failure the stream still closes; do not report success then.
        if (failed) {
          return;
        }
        console.log( `Finished writing archive ${zipPath} (${Math.round(archive.pointer()/1024)} kb).`);
        resolve(zipPath);
      });
      writeStream.on('error', fail);
      archive.on('warning', (err: ArchiverError) => {
        if (err.code === 'ENOENT') {
          console.warn(err.message);
        } else {
          fail(err);
        }
      });
      archive.on('error', (err: ArchiverError) => {
        fail(err);
      });

      // The asynchronous work lives in its own function so that any failure
      // is routed to reject() instead of becoming an unhandled rejection that
      // leaves the returned promise pending forever.
      const addPages = async (): Promise<void> => {
        // parse xml
        const xmlParser = new Parser();
        const xmlSerializer = new Builder();
        const xmlDoc = await xmlParser.parseStringPromise(await fsp.readFile(xmlPath, "utf8"));
        const pages = xmlDoc?.document?.page;
        if (!Array.isArray(pages)) {
          throw new Error(`No page elements found in ${xmlPath}`);
        }
        // extract images from PDF
        let counter = 1;
        const pdfDoc = await pdf(pdfPath, {scale:3});
        for await (const page of pdfDoc) {
          const fileId = `page-${String(counter).padStart(3,"0")}`;
          const imgFileName = `${fileId}.png`;
          const imgFilePath = path.join(outputDir, imgFileName);
          await fsp.writeFile(imgFilePath, page);
          console.log(`Adding ${imgFileName}`);
          archive.file(imgFilePath, { name: imgFileName });
          // select XML description of page
          xmlDoc.document.page = pages[counter-1];
          const xml = xmlSerializer.buildObject(xmlDoc);
          const xmlFileName = `${fileId}.xml`;
          console.log(`Adding ${xmlFileName}`);
          archive.append(xml, { name: xmlFileName });
          counter++;
        }
        await archive.finalize();
      };

      addPages().catch((err: unknown) => {
        fail(err);
        // Stop the archiver so the output stream is ended rather than left open.
        archive.abort();
      });
    });
  }

  /**
   * Uploads an archive to the `books` service.
   *
   * @param archivePath Path of the zip file to send
   * @param meta Metadata sent as query parameters; the object is not modified
   * @throws Error if the archive cannot be read, or if the parsed response
   *   contains a `message`, `status` or `code` member
   */
  public async uploadArchive(archivePath: string, meta: BookMetadata = {}): Promise<void> {
    const url = this.serviceUrl("books", meta);
    console.log(`Uploading ${path.basename(archivePath)} to ${this.endpoint}`);
    const body = createReadStream(archivePath);
    // Throwing inside an 'error' listener would surface as an uncaught
    // exception; turn a read failure into a rejection of this method instead.
    const readFailure = new Promise<never>((_resolve, reject) => {
      body.once('error', reject);
    });
    const response = await Promise.race([
      fetch(url, {
        method: "POST",
        headers: {
          "Content-Type": "application/zip"
        },
        body
      }),
      readFailure
    ]);
    const result = await response.json() as {message?: unknown, status?: unknown, code?: unknown};
    // error?
    if (result.message || result.status || result.code) {
      throw new Error(`${result.code} ${result.status}: ${result.message}`);
    }
    console.log(result);
  }
}
/**
 * Logs in to the PocoWeb server configured through the environment, builds
 * an archive from the given PDF/XML pair and uploads it, using the PDF file
 * name (without a trailing ".pdf") as the book title.
 *
 * Reads `POCOWEB_URL`, `POCOWEB_EMAIL` and `POCOWEB_PASSWD` from
 * `process.env`.
 *
 * @param pdfPath Path to the source PDF
 * @param xmlPath Path to the XML document belonging to the PDF
 * @throws Error if one of the required environment variables is missing
 */
async function uploadPdf(pdfPath: string, xmlPath: string): Promise<void> {
  const {POCOWEB_URL, POCOWEB_EMAIL, POCOWEB_PASSWD} = process.env;
  if (!POCOWEB_URL || !POCOWEB_EMAIL || !POCOWEB_PASSWD) {
    throw new Error("POCOWEB_URL, POCOWEB_EMAIL and POCOWEB_PASSWD must be set");
  }
  const pw = new PocowebApi(POCOWEB_URL);
  // SECURITY: this switches off TLS certificate verification for the whole
  // process, not only for requests to the PocoWeb server.
  process.env.NODE_TLS_REJECT_UNAUTHORIZED = "0"; // to get around self-signed certificate error
  await pw.authenticate(POCOWEB_EMAIL, POCOWEB_PASSWD);
  const archivePath = await pw.createArchive(pdfPath, xmlPath);
  // Strip only a trailing ".pdf" (any case), not the first occurrence anywhere.
  const title = path.basename(pdfPath).replace(/\.pdf$/i, "");
  await pw.uploadArchive(archivePath, {title});
  console.log(`${title} uploaded.`);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment