@ghandic
Last active June 5, 2023 11:40
Load a csv from S3 directly into memory, and write a csv to S3 directly from memory, by extending the pd.DataFrame class.
import boto3
import pandas as pd
from io import StringIO


class S3DataFrame(pd.DataFrame):
    """
    # Make a DataFrame and upload it as csv
    s3df = S3DataFrame({'h1': [1], 'h2': [2]})
    s3df.to_s3(Bucket='bucket-name',
               Key='file-key-on-s3',   # The name of the file when it is stored in S3
               SSEKMSKeyId='kms-id')   # Note: the KMS key should be in the same region as the bucket

    # Download the same csv into a DataFrame
    s3df2 = S3DataFrame.from_s3(Bucket='bucket-name', Key='file-key-on-s3')
    """

    client = boto3.client('s3')  # May need a region, but seems to work without one

    def to_s3(self, Bucket, Key, SSEKMSKeyId):
        """Writes the DataFrame as csv to S3 directly from memory."""
        csv_buffer = StringIO()
        self.to_csv(csv_buffer, index=False)
        return self.client.put_object(
            Bucket=Bucket,
            Key=Key,
            Body=csv_buffer.getvalue(),
            SSEKMSKeyId=SSEKMSKeyId,
            ServerSideEncryption='aws:kms',
        )

    @classmethod
    def from_s3(cls, Bucket, Key):
        """Reads a csv from S3 directly into memory."""
        res = cls.client.get_object(Bucket=Bucket, Key=Key)['Body'].read()
        # Wrap in cls so the result is an S3DataFrame rather than a plain DataFrame
        return cls(pd.read_csv(StringIO(res.decode('utf-8'))))


if __name__ == "__main__":
    BUCKET_NAME = 'bucket-name'
    FILE_KEY = 'test.csv'
    KMS_KEY_ID = 'kms-id'

    # Create a DataFrame and write it to S3
    s3df = S3DataFrame({'h1': [1], 'h2': [2]})
    s3df.to_s3(Bucket=BUCKET_NAME, Key=FILE_KEY, SSEKMSKeyId=KMS_KEY_ID)

    # Read it back from S3
    s3df2 = S3DataFrame.from_s3(Bucket=BUCKET_NAME, Key=FILE_KEY)
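
One optional refinement, not part of the original gist: pandas operations such as slicing or copy() on a DataFrame subclass fall back to returning a plain pd.DataFrame unless the subclass overrides the documented _constructor property. A minimal sketch of that addition, reusing the imports and the to_s3/from_s3 methods from the gist above:

class S3DataFrame(pd.DataFrame):

    client = boto3.client('s3')

    @property
    def _constructor(self):
        # Keeps derived results (slices, copies, etc.) typed as S3DataFrame
        return S3DataFrame

    # ... to_s3 / from_s3 as defined above ...

With that in place, an expression like S3DataFrame.from_s3(Bucket=BUCKET_NAME, Key=FILE_KEY).head(10) would still expose to_s3().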
fmelihh commented May 26, 2023

Thanks for the contribution to the community. This saved my day.
