jonpage · April 13, 2018 05:02
diff --git a/hi_richest_zip.Rmd b/hi_richest_zip.Rmd
 ---
 title: "Raking it in: Hawaii's High-Income Zip Codes"
 author: "Jonathan Page"
 date: "April 11, 2018"
 output: html_document
 ---

 ```{r setup, include=FALSE}
 knitr::opts_chunk$set(echo = TRUE)
 ```


 Honolulu CPI Annual Averages from https://www.bls.gov/regions/west/data/consumerpriceindex_honolulu_table.pdf
 to adjust everything to 2017 values.
 ```{r data-setup, echo=FALSE}
 cpi2017 <- 272.014
 cpi2015 <- 260.165
 cpi2012 <- 249.474
 cpi2010 <- 234.869
 ```

 Average Monthly Median Home Values for 2017 and 5yr percent change in real terms
 ```{r}
 library(tidyverse)
 all_homes_raw <- read_csv("http://files.zillowstatic.com/research/public/Zip/Zip_Zhvi_AllHomes.csv")
 all_homes <- all_homes_raw %>%
  filter(State == "HI") %>%
  gather(month, value, contains("-"), convert = TRUE) %>%
  mutate(
    year = stringr::str_extract(month, "^\\d{4}") %>% as.numeric(),
    month = stringr::str_extract(month, "\\d{2}$") %>% as.numeric(),
    zipcode = as.numeric(RegionName)
  ) %>%
  # this would match the Feb 2017 - Feb 2018 range mentioned in Bloomberg
  # but we'll use 2017 and 2012 data to create a nice comparison with our other 5YR change variables
  # this 5YR gap is convenient because 2012 is the first year that estimates are available for all
  # zipcodes and 2017 is the last year of complete data.
  # filter((year == 2017 && month >= 2) || (year == 2018 && month >= 2)) %>% 
  # group_by(zipcode)
  # summarize(value = mean(value, na.rm = TRUE))
  filter(year %in% c(2012, 2017)) %>%
  group_by(zipcode, year) %>%
  summarize(value = mean(value, na.rm = TRUE)) %>%
  spread(year, value) %>%
  mutate(
    median_home_2017 = `2017` * 1e-6,
    median_home_2012 = `2012` * 1e-6 * (cpi2017 / cpi2012),
    median_home_5yr = (median_home_2017 - median_home_2012) / median_home_2012 * 100
  ) %>%
  select(zipcode, median_home_2017, median_home_5yr)
 ```

 2015 IRS data
 ```{r}
 # proportion of returns above 200K (caveat: this can't be adjusted for inflation)
 prop200k_2015 <- read_csv("https://www.irs.gov/pub/irs-soi/15zpallagi.csv") %>%
  filter(
    STATE == "HI", 
    zipcode != "00000", 
    zipcode != "99999"
  ) %>%
  mutate(zipcode = as.numeric(zipcode)) %>%
  group_by(zipcode) %>%
  mutate(prop200k_2015 = N1 / sum(N1) * 100) %>%
  filter(agi_stub == 6) %>%
  select(zipcode, prop200k_2015)

 agi_2015 <- read_csv("https://www.irs.gov/pub/irs-soi/15zpallnoagi.csv") %>%
    filter(
    STATE == "HI", 
    ZIPCODE != "00000", 
    ZIPCODE != "99999"
  ) %>%
  transmute(
    zipcode = as.numeric(ZIPCODE),
    return_count_2015 = N1,
    avg_agi_2015 = A00100 * (cpi2017 / cpi2015) / N1,
    wage_per_return_2015 = A00200 * (cpi2017 / cpi2015) / N1,
    wage_per_wreturn_2015 = A00200 * (cpi2017 / cpi2015) / N00200,
    deduct_per_return_2015 = A04470 * (cpi2017 / cpi2015) / N1,
    deduct_per_dreturn_2015 = A04470 * (cpi2017 / cpi2015) / N04470
  )
 ```


 2010 IRS data
 ```{r}
 download.file("https://www.irs.gov/pub/irs-soi/2010zipcode.zip", destfile = "2010zipcode.zip")
 unzip("2010zipcode.zip", files = c("10zpallnoagi.csv", "10zpallagi.csv"))
 ```

 ```{r}
 # proportion of returns above 200K (caveat: this can't be adjusted for inflation)
 prop200k_2010 <- read_csv("10zpallagi.csv") %>%
  filter(
    STATE == "HI", 
    zipcode != "00000", 
    zipcode != "99999"
  ) %>%
  mutate(zipcode = as.numeric(zipcode)) %>%
  group_by(zipcode) %>%
  mutate(prop200k_2010 = N1 / sum(N1) * 100) %>%
  filter(agi_stub == 6) %>%
  select(zipcode, prop200k_2010)

 agi_2010 <- read_csv("10zpallnoagi.csv") %>%
    filter(
    STATE == "HI", 
    ZIPCODE != "00000", 
    ZIPCODE != "99999"
  ) %>%
  transmute(
    zipcode = as.numeric(ZIPCODE),
    return_count_2010 = N1,
    avg_agi_2010 = A00100 * (cpi2017 / cpi2010) / N1,
    wage_per_return_2010 = A00200 * (cpi2017 / cpi2010) / N1,
    wage_per_wreturn_2010 = A00200 * (cpi2017 / cpi2010) / N00200,
    deduct_per_return_2010 = A04470 * (cpi2017 / cpi2010) / N1,
    deduct_per_dreturn_2010 = A04470 * (cpi2017 / cpi2010) / N04470
  )
 ```

 Create 5YR variables
 ```{r}
 propk200k <- prop200k_2010 %>%
  full_join(prop200k_2015) %>%
  mutate(prop200k_5yr = (prop200k_2015 - prop200k_2010) / prop200k_2010 * 100) %>%
  select(zipcode, prop200k_2015, prop200k_5yr)
 agi <- agi_2010 %>%
  full_join(agi_2015) %>%
  mutate(
    return_count_5yr = (return_count_2015 - return_count_2010) / return_count_2010 * 100,
    avg_agi_5yr = (avg_agi_2015 - avg_agi_2010) / avg_agi_2010 * 100,
    wage_per_return_5yr = (wage_per_return_2015 - wage_per_return_2010) / wage_per_return_2010 * 100,
    wage_per_wreturn_5yr = (wage_per_wreturn_2015 - wage_per_wreturn_2010) / wage_per_wreturn_2010 * 100,
    deduct_per_return_5yr = (deduct_per_return_2015 - deduct_per_return_2010) / deduct_per_return_2010 * 100,
    deduct_per_dreturn_5yr = (deduct_per_dreturn_2015 - deduct_per_dreturn_2010) / deduct_per_dreturn_2010 * 100
  ) %>%
  select(-contains("2010"))
 ```

 Census residential household count
 Download CSV from https://factfinder.census.gov/bkmk/table/1.0/en/DEC/10_SF1/H1/0400000US15.86000P
 ```{r}
 housing_units <- read_csv("DEC_10_SF1_H1.csv") %>%
  transmute(zipcode = GEO.id2, housing_units = D001)
 ```

 ZIP code names from DBEDT http://files.hawaii.gov/dbedt/economic/databook/2015-individual/01/011915.xls

 ```{r}
 zipnames <- read_csv("011915_clean.csv")
 ```

 ```{r}
 combined_data <- agi %>%
  left_join(propk200k) %>%
  left_join(all_homes) %>%
  left_join(housing_units) %>%
  left_join(zipnames)
 write_csv(combined_data, path = "hi_richest_zip.csv", na = "")
 ```
	---
	title: "Raking it in: Hawaii's High-Income Zip Codes"
	author: "Jonathan Page"
	date: "April 11, 2018"
	output: html_document
	---

	```{r setup, include=FALSE}
	knitr::opts_chunk$set(echo = TRUE)
	```


	Honolulu CPI Annual Averages from https://www.bls.gov/regions/west/data/consumerpriceindex_honolulu_table.pdf
	to adjust everything to 2017 values.
	```{r data-setup, echo=FALSE}
	cpi2017 <- 272.014
	cpi2015 <- 260.165
	cpi2012 <- 249.474
	cpi2010 <- 234.869
	```

	Average Monthly Median Home Values for 2017 and 5yr percent change in real terms
	```{r}
	library(tidyverse)
	all_homes_raw <- read_csv("http://files.zillowstatic.com/research/public/Zip/Zip_Zhvi_AllHomes.csv")
	all_homes <- all_homes_raw %>%
	filter(State == "HI") %>%
	gather(month, value, contains("-"), convert = TRUE) %>%
	mutate(
	year = stringr::str_extract(month, "^\\d{4}") %>% as.numeric(),
	month = stringr::str_extract(month, "\\d{2}$") %>% as.numeric(),
	zipcode = as.numeric(RegionName)
	) %>%
	# this would match the Feb 2017 - Feb 2018 range mentioned in Bloomberg
	# but we'll use 2017 and 2012 data to create a nice comparison with our other 5YR change variables
	# this 5YR gap is convenient because 2012 is the first year that estimates are available for all
	# zipcodes and 2017 is the last year of complete data.
	# filter((year == 2017 && month >= 2) \|\| (year == 2018 && month >= 2)) %>%
	# group_by(zipcode)
	# summarize(value = mean(value, na.rm = TRUE))
	filter(year %in% c(2012, 2017)) %>%
	group_by(zipcode, year) %>%
	summarize(value = mean(value, na.rm = TRUE)) %>%
	spread(year, value) %>%
	mutate(
	median_home_2017 = `2017` * 1e-6,
	median_home_2012 = `2012` * 1e-6 * (cpi2017 / cpi2012),
	median_home_5yr = (median_home_2017 - median_home_2012) / median_home_2012 * 100
	) %>%
	select(zipcode, median_home_2017, median_home_5yr)
	```

	2015 IRS data
	```{r}
	# proportion of returns above 200K (caveat: this can't be adjusted for inflation)
	prop200k_2015 <- read_csv("https://www.irs.gov/pub/irs-soi/15zpallagi.csv") %>%
	filter(
	STATE == "HI",
	zipcode != "00000",
	zipcode != "99999"
	) %>%
	mutate(zipcode = as.numeric(zipcode)) %>%
	group_by(zipcode) %>%
	mutate(prop200k_2015 = N1 / sum(N1) * 100) %>%
	filter(agi_stub == 6) %>%
	select(zipcode, prop200k_2015)

	agi_2015 <- read_csv("https://www.irs.gov/pub/irs-soi/15zpallnoagi.csv") %>%
	filter(
	STATE == "HI",
	ZIPCODE != "00000",
	ZIPCODE != "99999"
	) %>%
	transmute(
	zipcode = as.numeric(ZIPCODE),
	return_count_2015 = N1,
	avg_agi_2015 = A00100 * (cpi2017 / cpi2015) / N1,
	wage_per_return_2015 = A00200 * (cpi2017 / cpi2015) / N1,
	wage_per_wreturn_2015 = A00200 * (cpi2017 / cpi2015) / N00200,
	deduct_per_return_2015 = A04470 * (cpi2017 / cpi2015) / N1,
	deduct_per_dreturn_2015 = A04470 * (cpi2017 / cpi2015) / N04470
	)
	```


	2010 IRS data
	```{r}
	download.file("https://www.irs.gov/pub/irs-soi/2010zipcode.zip", destfile = "2010zipcode.zip")
	unzip("2010zipcode.zip", files = c("10zpallnoagi.csv", "10zpallagi.csv"))
	```

	```{r}
	# proportion of returns above 200K (caveat: this can't be adjusted for inflation)
	prop200k_2010 <- read_csv("10zpallagi.csv") %>%
	filter(
	STATE == "HI",
	zipcode != "00000",
	zipcode != "99999"
	) %>%
	mutate(zipcode = as.numeric(zipcode)) %>%
	group_by(zipcode) %>%
	mutate(prop200k_2010 = N1 / sum(N1) * 100) %>%
	filter(agi_stub == 6) %>%
	select(zipcode, prop200k_2010)

	agi_2010 <- read_csv("10zpallnoagi.csv") %>%
	filter(
	STATE == "HI",
	ZIPCODE != "00000",
	ZIPCODE != "99999"
	) %>%
	transmute(
	zipcode = as.numeric(ZIPCODE),
	return_count_2010 = N1,
	avg_agi_2010 = A00100 * (cpi2017 / cpi2010) / N1,
	wage_per_return_2010 = A00200 * (cpi2017 / cpi2010) / N1,
	wage_per_wreturn_2010 = A00200 * (cpi2017 / cpi2010) / N00200,
	deduct_per_return_2010 = A04470 * (cpi2017 / cpi2010) / N1,
	deduct_per_dreturn_2010 = A04470 * (cpi2017 / cpi2010) / N04470
	)
	```

	Create 5YR variables
	```{r}
	propk200k <- prop200k_2010 %>%
	full_join(prop200k_2015) %>%
	mutate(prop200k_5yr = (prop200k_2015 - prop200k_2010) / prop200k_2010 * 100) %>%
	select(zipcode, prop200k_2015, prop200k_5yr)
	agi <- agi_2010 %>%
	full_join(agi_2015) %>%
	mutate(
	return_count_5yr = (return_count_2015 - return_count_2010) / return_count_2010 * 100,
	avg_agi_5yr = (avg_agi_2015 - avg_agi_2010) / avg_agi_2010 * 100,
	wage_per_return_5yr = (wage_per_return_2015 - wage_per_return_2010) / wage_per_return_2010 * 100,
	wage_per_wreturn_5yr = (wage_per_wreturn_2015 - wage_per_wreturn_2010) / wage_per_wreturn_2010 * 100,
	deduct_per_return_5yr = (deduct_per_return_2015 - deduct_per_return_2010) / deduct_per_return_2010 * 100,
	deduct_per_dreturn_5yr = (deduct_per_dreturn_2015 - deduct_per_dreturn_2010) / deduct_per_dreturn_2010 * 100
	) %>%
	select(-contains("2010"))
	```

	Census residential household count
	Download CSV from https://factfinder.census.gov/bkmk/table/1.0/en/DEC/10_SF1/H1/0400000US15.86000P
	```{r}
	housing_units <- read_csv("DEC_10_SF1_H1.csv") %>%
	transmute(zipcode = GEO.id2, housing_units = D001)
	```

	ZIP code names from DBEDT http://files.hawaii.gov/dbedt/economic/databook/2015-individual/01/011915.xls

	```{r}
	zipnames <- read_csv("011915_clean.csv")
	```

	```{r}
	combined_data <- agi %>%
	left_join(propk200k) %>%
	left_join(all_homes) %>%
	left_join(housing_units) %>%
	left_join(zipnames)
	write_csv(combined_data, path = "hi_richest_zip.csv", na = "")
	```