Skip to content

Instantly share code, notes, and snippets.

@braintreeps
Created February 21, 2013 16:48
Show Gist options
  • Save braintreeps/5006126 to your computer and use it in GitHub Desktop.
Save braintreeps/5006126 to your computer and use it in GitHub Desktop.
This is the R code to go along with the blog post "Mapping 35 Million Credit Cards On Top of Census Data With R".
library(data.table);
library(maps);
library(maptools);
library(spatstat);
library(zipcode);
library(GISTools)
#load the zipcode dataset
data(zipcode);
########## READ AND PREPARE THE DATA ###########
#read in our data; the csv has the following format: a key, a company_name, a postal_code, and a country_name
zips0<-read.csv("mergedZips0.csv", header=TRUE, colClasses=c("numeric", "character", "character", "character"))
#remove missing zip codes (there are a very small number of these)
zips0<-zips0[!is.na(zips0$postal_code),]
#get rid of some internal cards
zips0<-zips0[!grepl("Braintree", zips0$company_name),]
#get rid of zip codes that have any letters in them
zips0<-zips0[!grepl("[A-Z|a-z]", zips0$postal_code),]
#for zips that are zip+4, just keep the first five digits
zipPlus4Indices<-which(grepl('^[0-9]{5}-', zips0$postal_code));
zips0[zipPlus4Indices, "postal_code"]<-substr(zips0[zipPlus4Indices,"postal_code"], 1, 5)
#now get rid of anything that's left that's not a well-formed 5 digit zip
zips0<-zips0[grepl('^[0-9]{5}', zips0$postal_code),];
#and finally, get rid of anyone who has a country that's not the US to avoid any weird matching issues
#like foreign zips that could match onto a valid US zip for whatever reason
zips0<-zips0[is.na(zips0$country_name) || tolower(zips0$country_name)=="united states of america",]
#merge in the lat/long information from the zipcode dataset; from now on
#the "postal_code" column will be called "zip" because of the merge
mergedZips<-merge(zipcode, zips0, by.x=c("zip"), by.y=c("postal_code"))
#do some aggregation (count all cards in a zip) using a data.table, because it's much faster
mergedZipsDT<-data.table(mergedZips)
mergedZipsDT[, zipCount:=.N, by=list(zip)]
#keep only one row for each zip code
dedupedMergedZips<-unique(mergedZipsDT[,list(zip,latitude,longitude,zipCount)])
########## READ IN THE CENSUS ZCTA DATA ###########
#read in the zip to zcta mapping (crosswalk) file
zctaToZip<-read.csv("Zip_to_ZCTA_Crosswalk_2011_JSI.csv", header=TRUE, colClasses=c(rep("character", 5)))
#read in the "gazeteer" file that has population, latitude, and longitude for the zcta's
gaz<-read.table("Gaz_zcta_national.txt", header=TRUE, colClasses=c("character", rep("numeric", 8)))
#merge everything together
#the inner merge maps our card zip/count data onto the zcta's using the crosswalk file
#the outer merge maps zcta onto population and lat/long centroid using the gazeteer file
zctaWithZipsAndLatLongs<-merge(merge(as.data.frame(dedupedMergedZips), zctaToZip, by.x=c("zip"), by.y=c("ZIP")), gaz, by.x=c("ZCTA_USE"), by.y=c("GEOID"))
#since multiple zips can be mapped onto the same zcta, we have to do some aggregation again, this time over zcta
zctaWithZipsAndLatLongsDT<-data.table(zctaWithZipsAndLatLongs)
zctaWithZipsAndLatLongsDT[, totalZipCount:=sum(zipCount), by=list(ZCTA_USE)]
#again, keep only one row for each zcta
dedupedZctaWithZipsAndLatLongsDT<-unique(zctaWithZipsAndLatLongsDT[,list(ZCTA_USE,INTPTLAT,INTPTLONG,totalZipCount,POP10)])
#make a dataframe that will capture the card density type info that we want to plot
cardDensity<-as.data.frame(dedupedZctaWithZipsAndLatLongsDT[,list(ZCTA_USE, INTPTLAT, INTPTLONG, totalZipCount, POP10)])
colnames(cardDensity)<-c("zcta", "latitude", "longitude", "cardCount", "pop")
cardDensity$cardDensity<-(cardDensity$cardCount/cardDensity$pop)
#get rid of a couple of places where the population is zero, which can cause the cardDensity to blow up
cardDensity<-cardDensity[cardDensity$pop!=0,]
#require that there be fewer than one card per capita and
#only look at places that have 100 people or more to exclude weird outliers
#like the NSA, airports, Grand Central, etc. where nobody lives
cardDensity<-cardDensity[cardDensity$cardDensity<1. & cardDensity$pop>=100,]
########## POINTS PLOT ###########
longitudeLimits<-c(-130, -56)
latitudeLimits<-c(15, 60)
#add some random jitter
longitudeJitterSize<-diff(range(longitudeLimits))/5000.
latitudeJitterSize<-diff(range(latitudeLimits))/5000.
longitudeJitters<-runif(nrow(mergedZips), -longitudeJitterSize, longitudeJitterSize)
latitudeJitters<-runif(nrow(mergedZips), -latitudeJitterSize, latitudeJitterSize)
png(filename="pointsmap.png", type="quartz", bg="white", width=10.*960, height=10.*960, pointsize=1);
map("state", col="#000000", fill=TRUE, bg="#FFFFFF", lwd=1.0, xlim=longitudeLimits, ylim=latitudeLimits);
points(mergedZips$longitude+longitudeJitters, mergedZips$latitude+latitudeJitters, col="#FA9C58", pch=19, cex=0.001);
map("state", col="white", lwd=1.0, xlim=longitudeLimits, ylim=latitudeLimits, add=TRUE);
dev.off()
########## HEATMAP WITH WEIGHTS (MUCH, MUCH FASTER TO DO IT THIS WAY...) ###########
#make the 2D histogram; need to have the check=FALSE option set, otherwise we throw away points, which messes up the length of the vectors
#when we try use use weights below
points<-ppp(dedupedMergedZips[!is.na(dedupedMergedZips$longitude), longitude], dedupedMergedZips[!is.na(dedupedMergedZips$latitude), latitude], longitudeLimits, latitudeLimits, check=FALSE);
png(filename="heatmap_weighted.png", type="quartz", bg="white", width=10.*960, height=10.*960, pointsize=1);
spatstat.options(npixel=c(1000,1000));
densitymap<-density(points, sigma=0.15, weights=dedupedMergedZips[!is.na(dedupedMergedZips$longitude), zipCount]);
map("state", col="#000000", fill=TRUE, bg="#FFFFFF", lwd=1.0, xlim=longitudeLimits, ylim=latitudeLimits);
my.palette <- colorRampPalette(c("black", "gray", "orange", "white"), bias=5, space="rgb")
image(densitymap, col=my.palette(40), add=TRUE);
map("state", col="white", lwd=4.0, xlim=longitudeLimits, ylim=latitudeLimits, add=TRUE);
dev.off()
########## CHOROPLETH MAP ###########
#read in our census ZCTA shapefiles; the shapes objects has various slots:
#shapes@polygons has the actual polygon boundarys
#shapes@data has a dataframe, one row for each polygon
#it's important to retain the alignment between these two
shapes<-readShapeSpatial("tl_2010_us_zcta510.shp")
#convert the zip from a factor to a character
shapes@data$ZCTA_CHAR<-levels(shapes@data$ZCTA5CE10)[shapes@data$ZCTA5CE10]
#store all of our original row numbers, since we'll need to match them up later
shapes@data$original_rownum<-rownames(shapes@data)
#merge in our cardDensity data frame
#*********************************************************************************
#NOTE: MERGE SORTS THE ROWS BY DEFAULT, AND WE REALLY, REALLY DON'T WANT THAT!!!!!
#(TRUST ME, I WASTED A LOT OF TIME ON THIS)
#ALSO, WHEN ALL.X IS SET TO TRUE, IT PUTS THE ROWS WITH NO MATCH AT THE END, UNLIKE
#A REAL LEFT JOIN OR RIGHT JOIN IN SQL; SO WE HAVE TO SAVE THE ORIGINAL ROW NUMBERS
#AND RE-ORDER BY THEM SO THAT THINGS DON'T GET SCRAMBLED
#*********************************************************************************
merged<-merge(shapes@data, cardDensity, by.x=c("ZCTA_CHAR"), by.y=c("zcta"), all.x=TRUE, sort=FALSE);
merged<-merged[order(as.numeric(merged$original_rownum)),]
#*********************************************************************************
#force the row names of the data frame to be the same as the ID's of the polygons,
#because, according to
#http://rss.acs.unt.edu/Rdoc/library/sp/html/SpatialPolygonsDataFrame-class.html,
#things will get re-ordered otherwise and our plot will get scrambled
#this will allow you to see the row names of the polygons:
#ids<-sapply(shapes@polygons, function(x){x@ID})
#this will allow you to see the row names of the dataframe:
#dataids<-rownames(shapes@data)
#they have to match: identical(ids, dataids)
#*********************************************************************************
rownames(merged)<-merged$original_rownum
shapes@data<-merged
#restrict ourselves to the continental US
statesToRemove<-c("AK", "HI", "PR")
shapes<-shapes[-which(shapes@data$ZCTA_CHAR %in% zctaToZip[zctaToZip$StateAbbr %in% statesToRemove,"ZCTA_USE"]),]
#make ourselves a function for plotting various quantities
plotFullChoropleth<-function(quantity){
#make a color palette
my.palette <- colorRampPalette(c("black", "orange"), bias=1, space="Lab")
#pick our variable to color from
if(quantity=="POP_DENSITY"){
#POPULATION DENSITY
shapes.plotvar<-(shapes@data$pop/(shapes@data$ALAND10+shapes@data$AWATER10))
} else if(quantity=="RAW_POP"){
#RAW POPULATION
shapes.plotvar<-shapes@data$pop
} else if(quantity=="TOT_AREA"){
#TOTAL AREA
shapes.plotvar<-(shapes@data$ALAND10+shapes@data$AWATER10)
} else if(quantity=="RAW_CARDS"){
#RAW CARD COUNT
shapes.plotvar<-shapes@data$cardCount
} else if(quantity=="CARDS_PER_CAP"){
#PER CAPITA CARDS (TELLS US HOW THINGS SCALE WITH POPULATION DENSITY)
shapes.plotvar<-(shapes@data$cardCount/shapes@data$pop)
} else if(quantity=="CARD_DENSITY"){
#CARD DENSITY
shapes.plotvar<-(shapes@data$cardCount/(shapes@data$ALAND10+shapes@data$AWATER10))
} else {
stop("Dont' know that quantity!!")
}
#ignore any zcta's with outlier data
shapes.plotvar[is.na(shapes.plotvar)]<-0
shapes.plotvar[!is.finite(shapes.plotvar)]<-0
nShades<-80
shades<-auto.shading(shapes.plotvar ,n=nShades, cols=my.palette(nShades+1))
par.settings=list(axis.line=list(col=NA))
choropleth(shapes, v=shapes.plotvar, shades, bg="#000000", border=NA, lwd=0.1)
map("state", col="white", lwd=1.0, add=TRUE);
}
png(filename="chloroplethmap.png", type="quartz", bg="white", width=10.*960, height=5.*960, pointsize=1);
split.screen(c(2,2))
screen(1)
plotFullChoropleth("CARD_DENSITY")
screen(2)
plotFullChoropleth("POP_DENSITY")
screen(3)
plotFullChoropleth("CARDS_PER_CAP")
dev.off()
close.screen(all=TRUE)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment