braintreeps · February 21, 2013 16:48
diff --git a/map.R b/map.R
 library(data.table);
 library(maps);
 library(maptools);
 library(spatstat);
 library(zipcode);
 library(GISTools)
 #load the zipcode dataset
 data(zipcode);


 ########## READ AND PREPARE THE DATA ###########

 #read in our data; the csv has the following format: a key, a company_name, a postal_code, and a country_name
 zips0<-read.csv("mergedZips0.csv", header=TRUE, colClasses=c("numeric", "character", "character", "character"))

 #remove missing zip codes (there are a very small number of these)
 zips0<-zips0[!is.na(zips0$postal_code),]

 #get rid of some internal cards
 zips0<-zips0[!grepl("Braintree", zips0$company_name),]

 #get rid of zip codes that have any letters in them
 zips0<-zips0[!grepl("[A-Z|a-z]", zips0$postal_code),]

 #for zips that are zip+4, just keep the first five digits
 zipPlus4Indices<-which(grepl('^[0-9]{5}-', zips0$postal_code));
 zips0[zipPlus4Indices, "postal_code"]<-substr(zips0[zipPlus4Indices,"postal_code"], 1, 5)

 #now get rid of anything that's left that's not a well-formed 5 digit zip
 zips0<-zips0[grepl('^[0-9]{5}', zips0$postal_code),];

 #and finally, get rid of anyone who has a country that's not the US to avoid any weird matching issues
 #like foreign zips that could match onto a valid US zip for whatever reason
 zips0<-zips0[is.na(zips0$country_name) || tolower(zips0$country_name)=="united states of america",]

 #merge in the lat/long information from the zipcode dataset; from now on
 #the "postal_code" column will be called "zip" because of the merge
 mergedZips<-merge(zipcode, zips0, by.x=c("zip"), by.y=c("postal_code"))

 #do some aggregation (count all cards in a zip) using a data.table, because it's much faster
 mergedZipsDT<-data.table(mergedZips)
 mergedZipsDT[, zipCount:=.N, by=list(zip)]
 #keep only one row for each zip code
 dedupedMergedZips<-unique(mergedZipsDT[,list(zip,latitude,longitude,zipCount)])


 ########## READ IN THE CENSUS ZCTA DATA ###########

 #read in the zip to zcta mapping (crosswalk) file
 zctaToZip<-read.csv("Zip_to_ZCTA_Crosswalk_2011_JSI.csv", header=TRUE, colClasses=c(rep("character",  5)))
 #read in the "gazeteer" file that has population, latitude, and longitude for the zcta's
 gaz<-read.table("Gaz_zcta_national.txt", header=TRUE, colClasses=c("character", rep("numeric", 8)))

 #merge everything together
 #the inner merge maps our card zip/count data onto the zcta's using the crosswalk file
 #the outer merge maps zcta onto population and lat/long centroid using the gazeteer file
 zctaWithZipsAndLatLongs<-merge(merge(as.data.frame(dedupedMergedZips), zctaToZip, by.x=c("zip"), by.y=c("ZIP")), gaz, by.x=c("ZCTA_USE"), by.y=c("GEOID"))

 #since multiple zips can be mapped onto the same zcta, we have to do some aggregation again, this time over zcta
 zctaWithZipsAndLatLongsDT<-data.table(zctaWithZipsAndLatLongs)
 zctaWithZipsAndLatLongsDT[, totalZipCount:=sum(zipCount), by=list(ZCTA_USE)] 
 #again, keep only one row for each zcta
 dedupedZctaWithZipsAndLatLongsDT<-unique(zctaWithZipsAndLatLongsDT[,list(ZCTA_USE,INTPTLAT,INTPTLONG,totalZipCount,POP10)])

 #make a dataframe that will capture the card density type info that we want to plot
 cardDensity<-as.data.frame(dedupedZctaWithZipsAndLatLongsDT[,list(ZCTA_USE, INTPTLAT, INTPTLONG, totalZipCount, POP10)])
 colnames(cardDensity)<-c("zcta", "latitude", "longitude", "cardCount", "pop")
 cardDensity$cardDensity<-(cardDensity$cardCount/cardDensity$pop)
 #get rid of a couple of places where the population is zero, which can cause the cardDensity to blow up
 cardDensity<-cardDensity[cardDensity$pop!=0,]

 #require that there be fewer than one card per capita and
 #only look at places that have 100 people or more to exclude weird outliers
 #like the NSA, airports, Grand Central, etc. where nobody lives
 cardDensity<-cardDensity[cardDensity$cardDensity<1. & cardDensity$pop>=100,]


 ########## POINTS PLOT ###########

 longitudeLimits<-c(-130, -56)
 latitudeLimits<-c(15, 60)

 #add some random jitter
 longitudeJitterSize<-diff(range(longitudeLimits))/5000.
 latitudeJitterSize<-diff(range(latitudeLimits))/5000.
 longitudeJitters<-runif(nrow(mergedZips), -longitudeJitterSize, longitudeJitterSize)
 latitudeJitters<-runif(nrow(mergedZips), -latitudeJitterSize, latitudeJitterSize)

 png(filename="pointsmap.png", type="quartz", bg="white", width=10.*960, height=10.*960, pointsize=1);
 map("state", col="#000000", fill=TRUE, bg="#FFFFFF", lwd=1.0, xlim=longitudeLimits, ylim=latitudeLimits);
 points(mergedZips$longitude+longitudeJitters, mergedZips$latitude+latitudeJitters, col="#FA9C58", pch=19, cex=0.001);
 map("state", col="white", lwd=1.0, xlim=longitudeLimits, ylim=latitudeLimits, add=TRUE);
 dev.off()


 ########## HEATMAP WITH WEIGHTS (MUCH, MUCH FASTER TO DO IT THIS WAY...) ###########

 #make the 2D histogram; need to have the check=FALSE option set, otherwise we throw away points, which messes up the length of the vectors
 #when we try use use weights below
 points<-ppp(dedupedMergedZips[!is.na(dedupedMergedZips$longitude), longitude], dedupedMergedZips[!is.na(dedupedMergedZips$latitude), latitude], longitudeLimits, latitudeLimits, check=FALSE);

 png(filename="heatmap_weighted.png", type="quartz", bg="white", width=10.*960, height=10.*960, pointsize=1);
 spatstat.options(npixel=c(1000,1000));
 densitymap<-density(points, sigma=0.15, weights=dedupedMergedZips[!is.na(dedupedMergedZips$longitude), zipCount]);
 map("state", col="#000000", fill=TRUE, bg="#FFFFFF", lwd=1.0, xlim=longitudeLimits, ylim=latitudeLimits);
 my.palette <- colorRampPalette(c("black", "gray", "orange", "white"), bias=5, space="rgb")
 image(densitymap, col=my.palette(40), add=TRUE);
 map("state", col="white", lwd=4.0, xlim=longitudeLimits, ylim=latitudeLimits, add=TRUE);
 dev.off()


 ########## CHOROPLETH MAP ###########

 #read in our census ZCTA shapefiles; the shapes objects has various slots:
 #shapes@polygons has the actual polygon boundarys
 #shapes@data has a dataframe, one row for each polygon
 #it's important to retain the alignment between these two 
 shapes<-readShapeSpatial("tl_2010_us_zcta510.shp")

 #convert the zip from a factor to a character
 shapes@data$ZCTA_CHAR<-levels(shapes@data$ZCTA5CE10)[shapes@data$ZCTA5CE10]
 #store all of our original row numbers, since we'll need to match them up later
 shapes@data$original_rownum<-rownames(shapes@data)

 #merge in our cardDensity data frame
 #*********************************************************************************
 #NOTE: MERGE SORTS THE ROWS BY DEFAULT, AND WE REALLY, REALLY DON'T WANT THAT!!!!!
 #(TRUST ME, I WASTED A LOT OF TIME ON THIS)
 #ALSO, WHEN ALL.X IS SET TO TRUE, IT PUTS THE ROWS WITH NO MATCH AT THE END, UNLIKE
 #A REAL LEFT JOIN OR RIGHT JOIN IN SQL; SO WE HAVE TO SAVE THE ORIGINAL ROW NUMBERS
 #AND RE-ORDER BY THEM SO THAT THINGS DON'T GET SCRAMBLED
 #*********************************************************************************
 merged<-merge(shapes@data, cardDensity, by.x=c("ZCTA_CHAR"), by.y=c("zcta"), all.x=TRUE, sort=FALSE);
 merged<-merged[order(as.numeric(merged$original_rownum)),]
 #*********************************************************************************
 #force the row names of the data frame to be the same as the ID's of the polygons, 
 #because, according to 
 #http://rss.acs.unt.edu/Rdoc/library/sp/html/SpatialPolygonsDataFrame-class.html, 
 #things will get re-ordered otherwise and our plot will get scrambled
 #this will allow you to see the row names of the polygons:
 #ids<-sapply(shapes@polygons, function(x){x@ID})
 #this will allow you to see the row names of the dataframe:
 #dataids<-rownames(shapes@data)
 #they have to match: identical(ids, dataids)
 #*********************************************************************************
 rownames(merged)<-merged$original_rownum
 shapes@data<-merged

 #restrict ourselves to the continental US
 statesToRemove<-c("AK", "HI", "PR")
 shapes<-shapes[-which(shapes@data$ZCTA_CHAR %in% zctaToZip[zctaToZip$StateAbbr %in% statesToRemove,"ZCTA_USE"]),]

 #make ourselves a function for plotting various quantities
 plotFullChoropleth<-function(quantity){
  
  #make a color palette
  my.palette <- colorRampPalette(c("black", "orange"), bias=1, space="Lab")

  #pick our variable to color from
  if(quantity=="POP_DENSITY"){
    #POPULATION DENSITY 
    shapes.plotvar<-(shapes@data$pop/(shapes@data$ALAND10+shapes@data$AWATER10))
  } else if(quantity=="RAW_POP"){
    #RAW POPULATION
    shapes.plotvar<-shapes@data$pop
  } else if(quantity=="TOT_AREA"){
    #TOTAL AREA
    shapes.plotvar<-(shapes@data$ALAND10+shapes@data$AWATER10)
  } else if(quantity=="RAW_CARDS"){
    #RAW CARD COUNT
    shapes.plotvar<-shapes@data$cardCount
  } else if(quantity=="CARDS_PER_CAP"){
    #PER CAPITA CARDS (TELLS US HOW THINGS SCALE WITH POPULATION DENSITY)
    shapes.plotvar<-(shapes@data$cardCount/shapes@data$pop)
  } else if(quantity=="CARD_DENSITY"){
    #CARD DENSITY
    shapes.plotvar<-(shapes@data$cardCount/(shapes@data$ALAND10+shapes@data$AWATER10))
  } else {
    stop("Dont' know that quantity!!")
  }

  #ignore any zcta's with outlier data
  shapes.plotvar[is.na(shapes.plotvar)]<-0
  shapes.plotvar[!is.finite(shapes.plotvar)]<-0

  nShades<-80
  shades<-auto.shading(shapes.plotvar ,n=nShades, cols=my.palette(nShades+1))

  par.settings=list(axis.line=list(col=NA)) 
  choropleth(shapes, v=shapes.plotvar, shades, bg="#000000", border=NA, lwd=0.1)
  map("state", col="white", lwd=1.0, add=TRUE);

 }


 png(filename="chloroplethmap.png", type="quartz", bg="white", width=10.*960, height=5.*960, pointsize=1);
 split.screen(c(2,2))
 screen(1)
 plotFullChoropleth("CARD_DENSITY")
 screen(2)
 plotFullChoropleth("POP_DENSITY")
 screen(3)
 plotFullChoropleth("CARDS_PER_CAP")
 dev.off()
 close.screen(all=TRUE)
	library(data.table);
	library(maps);
	library(maptools);
	library(spatstat);
	library(zipcode);
	library(GISTools)
	#load the zipcode dataset
	data(zipcode);


	########## READ AND PREPARE THE DATA ###########

	#read in our data; the csv has the following format: a key, a company_name, a postal_code, and a country_name
	zips0<-read.csv("mergedZips0.csv", header=TRUE, colClasses=c("numeric", "character", "character", "character"))

	#remove missing zip codes (there are a very small number of these)
	zips0<-zips0[!is.na(zips0$postal_code),]

	#get rid of some internal cards
	zips0<-zips0[!grepl("Braintree", zips0$company_name),]

	#get rid of zip codes that have any letters in them
	zips0<-zips0[!grepl("[A-Z\|a-z]", zips0$postal_code),]

	#for zips that are zip+4, just keep the first five digits
	zipPlus4Indices<-which(grepl('^[0-9]{5}-', zips0$postal_code));
	zips0[zipPlus4Indices, "postal_code"]<-substr(zips0[zipPlus4Indices,"postal_code"], 1, 5)

	#now get rid of anything that's left that's not a well-formed 5 digit zip
	zips0<-zips0[grepl('^[0-9]{5}', zips0$postal_code),];

	#and finally, get rid of anyone who has a country that's not the US to avoid any weird matching issues
	#like foreign zips that could match onto a valid US zip for whatever reason
	zips0<-zips0[is.na(zips0$country_name) \|\| tolower(zips0$country_name)=="united states of america",]

	#merge in the lat/long information from the zipcode dataset; from now on
	#the "postal_code" column will be called "zip" because of the merge
	mergedZips<-merge(zipcode, zips0, by.x=c("zip"), by.y=c("postal_code"))

	#do some aggregation (count all cards in a zip) using a data.table, because it's much faster
	mergedZipsDT<-data.table(mergedZips)
	mergedZipsDT[, zipCount:=.N, by=list(zip)]
	#keep only one row for each zip code
	dedupedMergedZips<-unique(mergedZipsDT[,list(zip,latitude,longitude,zipCount)])


	########## READ IN THE CENSUS ZCTA DATA ###########

	#read in the zip to zcta mapping (crosswalk) file
	zctaToZip<-read.csv("Zip_to_ZCTA_Crosswalk_2011_JSI.csv", header=TRUE, colClasses=c(rep("character", 5)))
	#read in the "gazeteer" file that has population, latitude, and longitude for the zcta's
	gaz<-read.table("Gaz_zcta_national.txt", header=TRUE, colClasses=c("character", rep("numeric", 8)))

	#merge everything together
	#the inner merge maps our card zip/count data onto the zcta's using the crosswalk file
	#the outer merge maps zcta onto population and lat/long centroid using the gazeteer file
	zctaWithZipsAndLatLongs<-merge(merge(as.data.frame(dedupedMergedZips), zctaToZip, by.x=c("zip"), by.y=c("ZIP")), gaz, by.x=c("ZCTA_USE"), by.y=c("GEOID"))

	#since multiple zips can be mapped onto the same zcta, we have to do some aggregation again, this time over zcta
	zctaWithZipsAndLatLongsDT<-data.table(zctaWithZipsAndLatLongs)
	zctaWithZipsAndLatLongsDT[, totalZipCount:=sum(zipCount), by=list(ZCTA_USE)]
	#again, keep only one row for each zcta
	dedupedZctaWithZipsAndLatLongsDT<-unique(zctaWithZipsAndLatLongsDT[,list(ZCTA_USE,INTPTLAT,INTPTLONG,totalZipCount,POP10)])

	#make a dataframe that will capture the card density type info that we want to plot
	cardDensity<-as.data.frame(dedupedZctaWithZipsAndLatLongsDT[,list(ZCTA_USE, INTPTLAT, INTPTLONG, totalZipCount, POP10)])
	colnames(cardDensity)<-c("zcta", "latitude", "longitude", "cardCount", "pop")
	cardDensity$cardDensity<-(cardDensity$cardCount/cardDensity$pop)
	#get rid of a couple of places where the population is zero, which can cause the cardDensity to blow up
	cardDensity<-cardDensity[cardDensity$pop!=0,]

	#require that there be fewer than one card per capita and
	#only look at places that have 100 people or more to exclude weird outliers
	#like the NSA, airports, Grand Central, etc. where nobody lives
	cardDensity<-cardDensity[cardDensity$cardDensity<1. & cardDensity$pop>=100,]


	########## POINTS PLOT ###########

	longitudeLimits<-c(-130, -56)
	latitudeLimits<-c(15, 60)

	#add some random jitter
	longitudeJitterSize<-diff(range(longitudeLimits))/5000.
	latitudeJitterSize<-diff(range(latitudeLimits))/5000.
	longitudeJitters<-runif(nrow(mergedZips), -longitudeJitterSize, longitudeJitterSize)
	latitudeJitters<-runif(nrow(mergedZips), -latitudeJitterSize, latitudeJitterSize)

	png(filename="pointsmap.png", type="quartz", bg="white", width=10.960, height=10.960, pointsize=1);
	map("state", col="#000000", fill=TRUE, bg="#FFFFFF", lwd=1.0, xlim=longitudeLimits, ylim=latitudeLimits);
	points(mergedZips$longitude+longitudeJitters, mergedZips$latitude+latitudeJitters, col="#FA9C58", pch=19, cex=0.001);
	map("state", col="white", lwd=1.0, xlim=longitudeLimits, ylim=latitudeLimits, add=TRUE);
	dev.off()


	########## HEATMAP WITH WEIGHTS (MUCH, MUCH FASTER TO DO IT THIS WAY...) ###########

	#make the 2D histogram; need to have the check=FALSE option set, otherwise we throw away points, which messes up the length of the vectors
	#when we try use use weights below
	points<-ppp(dedupedMergedZips[!is.na(dedupedMergedZips$longitude), longitude], dedupedMergedZips[!is.na(dedupedMergedZips$latitude), latitude], longitudeLimits, latitudeLimits, check=FALSE);

	png(filename="heatmap_weighted.png", type="quartz", bg="white", width=10.960, height=10.960, pointsize=1);
	spatstat.options(npixel=c(1000,1000));
	densitymap<-density(points, sigma=0.15, weights=dedupedMergedZips[!is.na(dedupedMergedZips$longitude), zipCount]);
	map("state", col="#000000", fill=TRUE, bg="#FFFFFF", lwd=1.0, xlim=longitudeLimits, ylim=latitudeLimits);
	my.palette <- colorRampPalette(c("black", "gray", "orange", "white"), bias=5, space="rgb")
	image(densitymap, col=my.palette(40), add=TRUE);
	map("state", col="white", lwd=4.0, xlim=longitudeLimits, ylim=latitudeLimits, add=TRUE);
	dev.off()


	########## CHOROPLETH MAP ###########

	#read in our census ZCTA shapefiles; the shapes objects has various slots:
	#shapes@polygons has the actual polygon boundarys
	#shapes@data has a dataframe, one row for each polygon
	#it's important to retain the alignment between these two
	shapes<-readShapeSpatial("tl_2010_us_zcta510.shp")

	#convert the zip from a factor to a character
	shapes@data$ZCTA_CHAR<-levels(shapes@data$ZCTA5CE10)[shapes@data$ZCTA5CE10]
	#store all of our original row numbers, since we'll need to match them up later
	shapes@data$original_rownum<-rownames(shapes@data)

	#merge in our cardDensity data frame
	#*********************************************************************************
	#NOTE: MERGE SORTS THE ROWS BY DEFAULT, AND WE REALLY, REALLY DON'T WANT THAT!!!!!
	#(TRUST ME, I WASTED A LOT OF TIME ON THIS)
	#ALSO, WHEN ALL.X IS SET TO TRUE, IT PUTS THE ROWS WITH NO MATCH AT THE END, UNLIKE
	#A REAL LEFT JOIN OR RIGHT JOIN IN SQL; SO WE HAVE TO SAVE THE ORIGINAL ROW NUMBERS
	#AND RE-ORDER BY THEM SO THAT THINGS DON'T GET SCRAMBLED
	#*********************************************************************************
	merged<-merge(shapes@data, cardDensity, by.x=c("ZCTA_CHAR"), by.y=c("zcta"), all.x=TRUE, sort=FALSE);
	merged<-merged[order(as.numeric(merged$original_rownum)),]
	#*********************************************************************************
	#force the row names of the data frame to be the same as the ID's of the polygons,
	#because, according to
	#http://rss.acs.unt.edu/Rdoc/library/sp/html/SpatialPolygonsDataFrame-class.html,
	#things will get re-ordered otherwise and our plot will get scrambled
	#this will allow you to see the row names of the polygons:
	#ids<-sapply(shapes@polygons, function(x){x@ID})
	#this will allow you to see the row names of the dataframe:
	#dataids<-rownames(shapes@data)
	#they have to match: identical(ids, dataids)
	#*********************************************************************************
	rownames(merged)<-merged$original_rownum
	shapes@data<-merged

	#restrict ourselves to the continental US
	statesToRemove<-c("AK", "HI", "PR")
	shapes<-shapes[-which(shapes@data$ZCTA_CHAR %in% zctaToZip[zctaToZip$StateAbbr %in% statesToRemove,"ZCTA_USE"]),]

	#make ourselves a function for plotting various quantities
	plotFullChoropleth<-function(quantity){

	#make a color palette
	my.palette <- colorRampPalette(c("black", "orange"), bias=1, space="Lab")

	#pick our variable to color from
	if(quantity=="POP_DENSITY"){
	#POPULATION DENSITY
	shapes.plotvar<-(shapes@data$pop/(shapes@data$ALAND10+shapes@data$AWATER10))
	} else if(quantity=="RAW_POP"){
	#RAW POPULATION
	shapes.plotvar<-shapes@data$pop
	} else if(quantity=="TOT_AREA"){
	#TOTAL AREA
	shapes.plotvar<-(shapes@data$ALAND10+shapes@data$AWATER10)
	} else if(quantity=="RAW_CARDS"){
	#RAW CARD COUNT
	shapes.plotvar<-shapes@data$cardCount
	} else if(quantity=="CARDS_PER_CAP"){
	#PER CAPITA CARDS (TELLS US HOW THINGS SCALE WITH POPULATION DENSITY)
	shapes.plotvar<-(shapes@data$cardCount/shapes@data$pop)
	} else if(quantity=="CARD_DENSITY"){
	#CARD DENSITY
	shapes.plotvar<-(shapes@data$cardCount/(shapes@data$ALAND10+shapes@data$AWATER10))
	} else {
	stop("Dont' know that quantity!!")
	}

	#ignore any zcta's with outlier data
	shapes.plotvar[is.na(shapes.plotvar)]<-0
	shapes.plotvar[!is.finite(shapes.plotvar)]<-0

	nShades<-80
	shades<-auto.shading(shapes.plotvar ,n=nShades, cols=my.palette(nShades+1))

	par.settings=list(axis.line=list(col=NA))
	choropleth(shapes, v=shapes.plotvar, shades, bg="#000000", border=NA, lwd=0.1)
	map("state", col="white", lwd=1.0, add=TRUE);

	}


	png(filename="chloroplethmap.png", type="quartz", bg="white", width=10.960, height=5.960, pointsize=1);
	split.screen(c(2,2))
	screen(1)
	plotFullChoropleth("CARD_DENSITY")
	screen(2)
	plotFullChoropleth("POP_DENSITY")
	screen(3)
	plotFullChoropleth("CARDS_PER_CAP")
	dev.off()
	close.screen(all=TRUE)