Skip to content

Instantly share code, notes, and snippets.

@Mytherin
Created May 3, 2020 14:16
Show Gist options
  • Save Mytherin/37de933372eb608fb81b9bb64f2df44f to your computer and use it in GitHub Desktop.
Save Mytherin/37de933372eb608fb81b9bb64f2df44f to your computer and use it in GitHub Desktop.
import duckdb
import numpy as np
import pandas as pd
import time
lineitem = pd.read_csv('duckdb_benchmark_data/tpch_lineitem.csv', names=["l_orderkey","l_partkey","l_suppkey","l_linenumber","l_quantity","l_extendedprice","l_discount","l_tax","l_returnflag","l_linestatus","l_shipdate","l_commitdate","l_receiptdate","l_shipinstruct","l_shipmode","l_comment"], parse_dates=['l_shipdate', 'l_commitdate', 'l_receiptdate'])
def udf_disc_price(extended, discount):
return np.multiply(extended, np.subtract(1, discount))
def udf_charge(extended, discount, tax):
return np.multiply(extended, np.multiply(np.subtract(1, discount), np.add(1, tax)))
def q1():
df = lineitem[["l_shipdate", "l_returnflag", "l_linestatus", "l_quantity", "l_extendedprice", "l_discount", "l_tax"]][(lineitem['l_shipdate'] <= '1998-09-01')]
df['disc_price'] = udf_disc_price(df['l_extendedprice'], df['l_discount'])
df['charge'] = udf_charge(df['l_extendedprice'], df['l_discount'], df['l_tax'])
return df.groupby(['l_returnflag', 'l_linestatus'])\
.agg({'l_quantity': 'sum', 'l_extendedprice': 'sum', 'disc_price': 'sum', 'charge': 'sum',
'l_quantity': 'mean', 'l_extendedprice': 'mean', 'l_discount': 'mean', 'l_shipdate': 'count'})
start = time.time()
print(q1())
end = time.time()
print('Base Pandas: ' + str(end - start) + " seconds")
con = duckdb.connect(':memory:')
start = time.time()
print(con.from_df(lineitem).filter("l_shipdate <= cast('1998-09-02' AS date)").aggregate(
'''
l_returnflag,
l_linestatus,
sum(l_quantity) AS sum_qty,
sum(l_extendedprice) AS sum_base_price,
sum(l_extendedprice * (1 - l_discount)) AS sum_disc_price,
sum(l_extendedprice * (1 - l_discount) * (1 + l_tax)) AS sum_charge,
avg(l_quantity) AS avg_qty,
avg(l_extendedprice) AS avg_price,
avg(l_discount) AS avg_disc,
count(*) AS count_order'''
))
end = time.time()
print('Pandas Scan: ' + str(end - start) + " seconds")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment