Created
December 7, 2023 08:54
-
-
Save alexhornbake/89b2b67037640fa8377bfab95303b7ec to your computer and use it in GitHub Desktop.
Cost of converting to/from polars to join a dataframe
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# mprof run polars_test.py
# mprof: Sampling memory every 0.1s
# running new process
# running as a Python program...
# 1701938865.155861 - starting
# 1701938930.560897 - time to generate dataframes: 65.40504503250122
# 1701938930.5609741 - starting pandas join
# Filename: polars_test.py
# Line # Mem usage Increment Occurrences Line Contents
# =============================================================
# 20 743.5 MiB 743.5 MiB 1 @profile
# 21 def pandas_join(df1, df2):
# 22 1242.6 MiB 499.1 MiB 1 return df1.merge(df2, left_on=['string1', 'string2'], right_on=['string1', 'string2'], how='left')
# 1701938932.990345 - time to pandas join: 2.4293649196624756
# 1701938932.9903831 - starting polars convert
# Filename: polars_test.py
# Line # Mem usage Increment Occurrences Line Contents
# =============================================================
# 25 1204.7 MiB 1204.7 MiB 1 @profile
# 26 def polars_convert(df1, df2):
# 27 1190.5 MiB -14.2 MiB 1 pl_df1 = pl.from_pandas(df1)
# 28 1272.8 MiB 82.2 MiB 1 pl_df2 = pl.from_pandas(df2)
# 29 1272.8 MiB 0.0 MiB 1 return pl_df1, pl_df2
# 1701938933.733557 - time to polars convert: 0.7436940670013428
# 1701938933.734706 - starting polars join
# Filename: polars_test.py
# Line # Mem usage Increment Occurrences Line Contents
# =============================================================
# 32 1264.8 MiB 1264.8 MiB 1 @profile
# 33 def polars_join(pl_df1, pl_df2):
# 34 1237.6 MiB -27.2 MiB 1 return pl_df1.join(pl_df2, on=['string1', 'string2'], how='left')
# 1701938933.9606578 - time to polars join: 0.22594881057739258
# 1701938933.960665 - starting pandas convert
# Filename: polars_test.py
# Line # Mem usage Increment Occurrences Line Contents
# =============================================================
# 37 1237.7 MiB 1237.7 MiB 1 @profile
# 38 def pandas_convert(pl_df1, pl_df2):
# 39 1381.2 MiB 143.5 MiB 1 df1 = pl_df1.to_pandas()
# 40 1158.6 MiB -222.6 MiB 1 df2 = pl_df2.to_pandas()
# 41 1158.6 MiB 0.0 MiB 1 return df1, df2
# 1701938935.944544 - time to pandas convert: 1.9838781356811523
import pandas as pd | |
import polars as pl | |
from faker import Faker | |
from memory_profiler import memory_usage, profile | |
import time | |
def generate_dataframe(num_rows, seed=None):
    """Build a shuffled pandas DataFrame of random benchmark data.

    The frame has five string columns (``string0``..``string4``), each
    value produced by ``pystr(min_chars=16, max_chars=16)``, and five
    integer columns (``int0``..``int4``), each value produced by
    ``random_int(min=0, max=1000)``.

    Args:
        num_rows: Number of rows to generate.
        seed: Optional seed. When given, it seeds the Faker instance and
            is also used as ``random_state`` for the row shuffle, so two
            calls with the same seed are meant to yield identical frames.

    Returns:
        A ``pd.DataFrame`` with ``num_rows`` rows in shuffled order.
    """
    # Faker's first positional argument is the locale, not a seed, so
    # ``Faker(seed)`` leaves the generator unseeded. Seed the instance
    # explicitly so "same seed" really means "same data".
    fake = Faker()
    if seed is not None:
        fake.seed_instance(seed)

    data = {
        f'string{i}': [
            fake.pystr(min_chars=16, max_chars=16) for _ in range(num_rows)
        ]
        for i in range(5)
    }
    data.update({
        f'int{i}': [
            fake.random_int(min=0, max=1000) for _ in range(num_rows)
        ]
        for i in range(5)
    })
    df = pd.DataFrame(data)
    return df.sample(frac=1, random_state=seed)  # shuffle rows
@profile
def pandas_join(df1, df2):
    """Left-merge ``df1`` with ``df2`` in pandas.

    The merge is keyed on the ``string1`` and ``string2`` columns of both
    frames. The function is wrapped with memory_profiler's ``profile``
    decorator.

    Returns:
        The merged pandas DataFrame.
    """
    join_keys = ['string1', 'string2']
    return df1.merge(df2, left_on=join_keys, right_on=join_keys, how='left')
@profile
def polars_convert(df1, df2):
    """Convert two pandas DataFrames into polars DataFrames.

    Returns:
        A ``(first, second)`` tuple of polars frames, in argument order.
    """
    # One statement per conversion so the line-by-line memory report
    # attributes an increment to each conversion separately.
    first_pl = pl.from_pandas(df1)
    second_pl = pl.from_pandas(df2)
    return first_pl, second_pl
@profile
def polars_join(pl_df1, pl_df2):
    """Left-join ``pl_df1`` with ``pl_df2`` in polars.

    The join is keyed on the ``string1`` and ``string2`` columns.

    Returns:
        The joined polars DataFrame.
    """
    join_keys = ['string1', 'string2']
    return pl_df1.join(pl_df2, on=join_keys, how='left')
@profile
def pandas_convert(pl_df1, pl_df2):
    """Convert two polars DataFrames back into pandas DataFrames.

    Returns:
        A ``(first, second)`` tuple of pandas frames, in argument order.
    """
    # One statement per conversion so the line-by-line memory report
    # attributes an increment to each conversion separately.
    first_pd = pl_df1.to_pandas()
    second_pd = pl_df2.to_pandas()
    return first_pd, second_pd
def _run_timed(label, func, *args):
    """Call ``func(*args)`` and print start and elapsed-time lines for it.

    Args:
        label: Step name inserted into the two printed messages.
        func: Callable to run.
        *args: Positional arguments forwarded to ``func``.

    Returns:
        Whatever ``func(*args)`` returns.
    """
    print(time.time(), f" - starting {label}")
    # perf_counter is monotonic, so the measured interval cannot be skewed
    # by wall-clock adjustments; time.time() is kept for the timestamps.
    began = time.perf_counter()
    result = func(*args)
    print(time.time(), f" - time to {label}: ", time.perf_counter() - began)
    return result


if __name__ == "__main__":
    # NOTE: every step timed below is decorated with @profile, so the
    # reported durations include whatever overhead that decorator adds.
    print(time.time(), " - starting")
    began = time.perf_counter()
    # Generate the DataFrames with the same seed
    df1 = generate_dataframe(1_000_000, seed=42)
    df2 = generate_dataframe(1_000_000, seed=42)
    print(time.time(), " - time to generate dataframes: ", time.perf_counter() - began)

    # Join results are intentionally discarded; only time and memory matter.
    _run_timed("pandas join", pandas_join, df1, df2)
    pl_df1, pl_df2 = _run_timed("polars convert", polars_convert, df1, df2)
    _run_timed("polars join", polars_join, pl_df1, pl_df2)
    df1, df2 = _run_timed("pandas convert", pandas_convert, pl_df1, pl_df2)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment