Skip to content

Instantly share code, notes, and snippets.

@enjalot
Created May 19, 2022 17:40
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save enjalot/c9e02ccf4826909509f43acbdd437494 to your computer and use it in GitHub Desktop.
Save enjalot/c9e02ccf4826909509f43acbdd437494 to your computer and use it in GitHub Desktop.
Node script to convert a CSV file to Parquet
node duckdb.cjs mydata.csv

This will create the file mydata.parquet with GZIP compression.

const duckdb = require('@duckdb/duckdb-wasm');
const path = require('path');
const fs = require('fs');
const Worker = require('web-worker');

// Directory holding the duckdb-wasm bundles (wasm modules and node workers).
const DUCKDB_DIST = path.dirname(require.resolve('@duckdb/duckdb-wasm'));

// Usage: node duckdb.cjs filename
// Expects the csv file in the same directory; the ".csv" extension is optional.
const inputArg = process.argv[2];
if (!inputArg) {
  console.error('usage: node duckdb.cjs <file.csv>');
  process.exit(1);
}

// Strip only a trailing ".csv" so names that merely contain ".csv" stay intact.
const filename = inputArg.replace(/\.csv$/, '');
// The trailing semicolon matters: the script ends with an IIFE, and a statement
// followed by a line starting with "(" is otherwise parsed as a call.
console.log("filename", filename);

/**
 * Quote a name for use as a SQL identifier (doubles embedded double quotes).
 * @param {string} name
 * @returns {string}
 */
function quoteIdentifier(name) {
  return `"${name.replace(/"/g, '""')}"`;
}

/**
 * Quote a value for use as a SQL string literal (doubles embedded single quotes).
 * @param {string} value
 * @returns {string}
 */
function quoteLiteral(value) {
  return `'${value.replace(/'/g, "''")}'`;
}

/**
 * Load `<baseName>.csv` into DuckDB-Wasm, export it as GZIP-compressed Parquet
 * and write the result to `<baseName>.parquet` on disk.
 *
 * The connection, database and worker are always released, even when a step
 * fails, so a failed run does not leave the worker running.
 *
 * @param {string} baseName - Path of the CSV file without the ".csv" extension;
 *   also used as the name of the table inside DuckDB.
 * @returns {Promise<void>}
 */
async function convertCsvToParquet(baseName) {
  const csvPath = `${baseName}.csv`;
  const parquetPath = `${baseName}.parquet`;
  const table = quoteIdentifier(baseName);

  let worker;
  let db;
  let conn;
  try {
    const DUCKDB_CONFIG = await duckdb.selectBundle({
      mvp: {
        mainModule: path.resolve(DUCKDB_DIST, './duckdb-mvp.wasm'),
        mainWorker: path.resolve(DUCKDB_DIST, './duckdb-node-mvp.worker.cjs'),
      },
      next: {
        mainModule: path.resolve(DUCKDB_DIST, './duckdb-next.wasm'),
        mainWorker: path.resolve(DUCKDB_DIST, './duckdb-node-next.worker.cjs'),
      },
    });

    const logger = new duckdb.ConsoleLogger();
    worker = new Worker(DUCKDB_CONFIG.mainWorker);
    db = new duckdb.AsyncDuckDB(logger, worker);
    await db.instantiate(DUCKDB_CONFIG.mainModule, DUCKDB_CONFIG.pthreadWorker);
    conn = await db.connect();

    console.log("reading");
    // Register the CSV text under the same name so DuckDB can read it by path.
    const txt = fs.readFileSync(csvPath, 'utf8');
    await db.registerFileText(csvPath, txt);
    await conn.insertCSVFromPath(csvPath, {
      schema: 'main',
      name: baseName,
    });

    console.log("inserted, querying");
    // Print one row as a quick check that the import worked.
    const res = await conn.query(`SELECT * from ${table} LIMIT 1`);
    console.log(res.toArray());

    // COPY writes into DuckDB's own file system; pull the bytes back out and
    // persist them to the real disk.
    await conn.query(
      `COPY (SELECT * FROM ${table}) TO ${quoteLiteral(parquetPath)} (FORMAT 'parquet', CODEC 'GZIP')`
    );
    const buffer = await db.copyFileToBuffer(parquetPath);
    console.log("buffer len", buffer.length);
    fs.writeFileSync(parquetPath, Buffer.from(new Uint8Array(buffer)));
    console.log("parquet file written");
  } finally {
    // Release in reverse order of acquisition; the worker is terminated even
    // if closing the connection or database throws.
    try {
      if (conn) await conn.close();
      if (db) await db.terminate();
    } finally {
      if (worker) await worker.terminate();
    }
  }
}

(async () => {
  try {
    await convertCsvToParquet(filename);
    console.log("all done");
  } catch (e) {
    console.error(e);
    // Signal failure to the calling shell instead of exiting with 0.
    process.exitCode = 1;
  }
})();
{
"name": "bandcamp",
"version": "0.0.1",
"description": "",
"main": "duckdb.cjs",
"scripts": {
"test": "echo \"Error: no test specified\" && exit 1"
},
"author": "",
"license": "ISC",
"dependencies": {
"@duckdb/duckdb-wasm": "1.14.3"
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment