Last active
April 13, 2018 05:02
-
-
Save jonpage/7f47cb9aea6ef9fb67e72823ee93136b to your computer and use it in GitHub Desktop.
Rmd for creating the data file used in the Observable notebook
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
--- | |
title: "Raking it in: Hawaii's High-Income Zip Codes" | |
author: "Jonathan Page" | |
date: "April 11, 2018" | |
output: html_document | |
--- | |
```{r setup, include=FALSE} | |
knitr::opts_chunk$set(echo = TRUE) | |
``` | |
Honolulu CPI Annual Averages from https://www.bls.gov/regions/west/data/consumerpriceindex_honolulu_table.pdf | |
to adjust everything to 2017 values. | |
```{r data-setup, echo=FALSE} | |
cpi2017 <- 272.014 | |
cpi2015 <- 260.165 | |
cpi2012 <- 249.474 | |
cpi2010 <- 234.869 | |
``` | |
Average Monthly Median Home Values for 2017 and 5yr percent change in real terms | |
```{r} | |
library(tidyverse) | |
all_homes_raw <- read_csv("http://files.zillowstatic.com/research/public/Zip/Zip_Zhvi_AllHomes.csv") | |
all_homes <- all_homes_raw %>% | |
filter(State == "HI") %>% | |
gather(month, value, contains("-"), convert = TRUE) %>% | |
mutate( | |
year = stringr::str_extract(month, "^\\d{4}") %>% as.numeric(), | |
month = stringr::str_extract(month, "\\d{2}$") %>% as.numeric(), | |
zipcode = as.numeric(RegionName) | |
) %>% | |
# this would match the Feb 2017 - Feb 2018 range mentioned in Bloomberg | |
# but we'll use 2017 and 2012 data to create a nice comparison with our other 5YR change variables | |
# this 5YR gap is convenient because 2012 is the first year that estimates are available for all | |
# zipcodes and 2017 is the last year of complete data. | |
# filter((year == 2017 && month >= 2) || (year == 2018 && month >= 2)) %>% | |
# group_by(zipcode) | |
# summarize(value = mean(value, na.rm = TRUE)) | |
filter(year %in% c(2012, 2017)) %>% | |
group_by(zipcode, year) %>% | |
summarize(value = mean(value, na.rm = TRUE)) %>% | |
spread(year, value) %>% | |
mutate( | |
median_home_2017 = `2017` * 1e-6, | |
median_home_2012 = `2012` * 1e-6 * (cpi2017 / cpi2012), | |
median_home_5yr = (median_home_2017 - median_home_2012) / median_home_2012 * 100 | |
) %>% | |
select(zipcode, median_home_2017, median_home_5yr) | |
``` | |
2015 IRS data | |
```{r} | |
# proportion of returns above 200K (caveat: this can't be adjusted for inflation) | |
prop200k_2015 <- read_csv("https://www.irs.gov/pub/irs-soi/15zpallagi.csv") %>% | |
filter( | |
STATE == "HI", | |
zipcode != "00000", | |
zipcode != "99999" | |
) %>% | |
mutate(zipcode = as.numeric(zipcode)) %>% | |
group_by(zipcode) %>% | |
mutate(prop200k_2015 = N1 / sum(N1) * 100) %>% | |
filter(agi_stub == 6) %>% | |
select(zipcode, prop200k_2015) | |
agi_2015 <- read_csv("https://www.irs.gov/pub/irs-soi/15zpallnoagi.csv") %>% | |
filter( | |
STATE == "HI", | |
ZIPCODE != "00000", | |
ZIPCODE != "99999" | |
) %>% | |
transmute( | |
zipcode = as.numeric(ZIPCODE), | |
return_count_2015 = N1, | |
avg_agi_2015 = A00100 * (cpi2017 / cpi2015) / N1, | |
wage_per_return_2015 = A00200 * (cpi2017 / cpi2015) / N1, | |
wage_per_wreturn_2015 = A00200 * (cpi2017 / cpi2015) / N00200, | |
deduct_per_return_2015 = A04470 * (cpi2017 / cpi2015) / N1, | |
deduct_per_dreturn_2015 = A04470 * (cpi2017 / cpi2015) / N04470 | |
) | |
``` | |
2010 IRS data | |
```{r} | |
download.file("https://www.irs.gov/pub/irs-soi/2010zipcode.zip", destfile = "2010zipcode.zip") | |
unzip("2010zipcode.zip", files = c("10zpallnoagi.csv", "10zpallagi.csv")) | |
``` | |
```{r} | |
# proportion of returns above 200K (caveat: this can't be adjusted for inflation) | |
prop200k_2010 <- read_csv("10zpallagi.csv") %>% | |
filter( | |
STATE == "HI", | |
zipcode != "00000", | |
zipcode != "99999" | |
) %>% | |
mutate(zipcode = as.numeric(zipcode)) %>% | |
group_by(zipcode) %>% | |
mutate(prop200k_2010 = N1 / sum(N1) * 100) %>% | |
filter(agi_stub == 6) %>% | |
select(zipcode, prop200k_2010) | |
agi_2010 <- read_csv("10zpallnoagi.csv") %>% | |
filter( | |
STATE == "HI", | |
ZIPCODE != "00000", | |
ZIPCODE != "99999" | |
) %>% | |
transmute( | |
zipcode = as.numeric(ZIPCODE), | |
return_count_2010 = N1, | |
avg_agi_2010 = A00100 * (cpi2017 / cpi2010) / N1, | |
wage_per_return_2010 = A00200 * (cpi2017 / cpi2010) / N1, | |
wage_per_wreturn_2010 = A00200 * (cpi2017 / cpi2010) / N00200, | |
deduct_per_return_2010 = A04470 * (cpi2017 / cpi2010) / N1, | |
deduct_per_dreturn_2010 = A04470 * (cpi2017 / cpi2010) / N04470 | |
) | |
``` | |
Create 5YR variables | |
```{r} | |
propk200k <- prop200k_2010 %>% | |
full_join(prop200k_2015) %>% | |
mutate(prop200k_5yr = (prop200k_2015 - prop200k_2010) / prop200k_2010 * 100) %>% | |
select(zipcode, prop200k_2015, prop200k_5yr) | |
agi <- agi_2010 %>% | |
full_join(agi_2015) %>% | |
mutate( | |
return_count_5yr = (return_count_2015 - return_count_2010) / return_count_2010 * 100, | |
avg_agi_5yr = (avg_agi_2015 - avg_agi_2010) / avg_agi_2010 * 100, | |
wage_per_return_5yr = (wage_per_return_2015 - wage_per_return_2010) / wage_per_return_2010 * 100, | |
wage_per_wreturn_5yr = (wage_per_wreturn_2015 - wage_per_wreturn_2010) / wage_per_wreturn_2010 * 100, | |
deduct_per_return_5yr = (deduct_per_return_2015 - deduct_per_return_2010) / deduct_per_return_2010 * 100, | |
deduct_per_dreturn_5yr = (deduct_per_dreturn_2015 - deduct_per_dreturn_2010) / deduct_per_dreturn_2010 * 100 | |
) %>% | |
select(-contains("2010")) | |
``` | |
Census residential household count | |
Download CSV from https://factfinder.census.gov/bkmk/table/1.0/en/DEC/10_SF1/H1/0400000US15.86000P | |
```{r} | |
housing_units <- read_csv("DEC_10_SF1_H1.csv") %>% | |
transmute(zipcode = GEO.id2, housing_units = D001) | |
``` | |
ZIP code names from DBEDT http://files.hawaii.gov/dbedt/economic/databook/2015-individual/01/011915.xls | |
```{r} | |
zipnames <- read_csv("011915_clean.csv") | |
``` | |
```{r} | |
combined_data <- agi %>% | |
left_join(propk200k) %>% | |
left_join(all_homes) %>% | |
left_join(housing_units) %>% | |
left_join(zipnames) | |
write_csv(combined_data, path = "hi_richest_zip.csv", na = "") | |
``` |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment