Skip to content

Instantly share code, notes, and snippets.

@meconlin
Created September 21, 2015 15:24
Show Gist options
  • Save meconlin/fcd0129da4d6f7e8b6ba to your computer and use it in GitHub Desktop.
Save meconlin/fcd0129da4d6f7e8b6ba to your computer and use it in GitHub Desktop.
AWS Data Pipeline definition: unzip an archive from S3, filter the extracted CSV with awk, and write the result back to S3
{
"objects": [
{
"directoryPath": "#{myS3OutputLoc}/",
"name": "S3OutputLocation",
"id": "S3OutputLocation",
"type": "S3DataNode"
},
{
"period": "1 day",
"name": "Every 1 day",
"id": "DefaultSchedule",
"type": "Schedule",
"startAt": "FIRST_ACTIVATION_DATE_TIME"
},
{
"directoryPath": "#{myS3InputLoc}",
"name": "S3InputLocation",
"id": "S3InputLocation",
"type": "S3DataNode"
},
{
"output": {
"ref": "S3OutputLocation"
},
"input": {
"ref": "S3InputLocation"
},
"stage": "true",
"name": "ShellCommandActivityObj",
"id": "ShellCommandActivityObj",
"runsOn": {
"ref": "EC2ResourceObj"
},
"type": "ShellCommandActivity",
"command": "#{myShellCmd}"
},
{
"failureAndRerunMode": "CASCADE",
"schedule": {
"ref": "DefaultSchedule"
},
"resourceRole": "DataPipelineDefaultResourceRole",
"role": "DataPipelineDefaultRole",
"pipelineLogUri": "s3://carlingo.datapipeline.logs/",
"scheduleType": "cron",
"name": "Default",
"id": "Default"
},
{
"instanceType": "t1.micro",
"name": "EC2ResourceObj",
"id": "EC2ResourceObj",
"type": "Ec2Resource",
"terminateAfter": "20 Minutes"
}
],
"parameters": [
{
"description": "S3 output folder",
"id": "myS3OutputLoc",
"type": "AWS::S3::ObjectKey"
},
{
"default": "s3://us-east-1.elasticmapreduce.samples/pig-apache-logs/data",
"description": "S3 input folder",
"id": "myS3InputLoc",
"type": "AWS::S3::ObjectKey"
},
{
"default": "grep -rc \"GET\" ${INPUT1_STAGING_DIR}/* > ${OUTPUT1_STAGING_DIR}/output.txt",
"description": "Shell command to run",
"id": "myShellCmd",
"type": "String"
}
],
"values": {
"myShellCmd": "unzip -p ${INPUT1_STAGING_DIR}/DataOne_US_LDV_Data.zip VIN_REFERENCE.csv | awk -F '\",\"' '$4 >= 2008 { print }' > ${OUTPUT1_STAGING_DIR}/vin_reference.csv",
"myS3InputLoc": "s3://carlingo.datapipeline.data/staging/",
"myS3OutputLoc": "s3://carlingo.datapipeline.data/staging/extracted"
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment