Skip to content

Instantly share code, notes, and snippets.

@ensley
Last active November 18, 2015 02:33
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ensley/c3889b285d2327c01e23 to your computer and use it in GitHub Desktop.
Save ensley/c3889b285d2327c01e23 to your computer and use it in GitHub Desktop.
MLB payroll and wins
library(ggplot2)
library(ggthemes)
library(dplyr)
# Load season-by-season standings and payroll data.
# stringsAsFactors = FALSE keeps `tm` as character on every R version (the
# default only became FALSE in R 4.0), matching how the lookup tables are
# read, so the later joins compare character keys with character keys.
df <- read.csv("mlb-standings-and-payroll.csv", stringsAsFactors = FALSE)

# Keep seasons from 1985 on and only the columns used below.
# as_tibble() replaces tbl_df(), which dplyr has deprecated.
df <- df %>%
  filter(year >= 1985) %>%
  select(tm, year, w, g, wins_losses, est_payroll) %>%
  as_tibble()
# Map historic team abbreviations onto their modern equivalents. The lookup
# table must supply `modern_team`, which is the key for the next join.
# FALSE is spelled out because `F` is an ordinary variable that can be
# reassigned.
team.lookups <- read.csv("team-lookups.csv", stringsAsFactors = FALSE)
df <- left_join(df, team.lookups, by = c("tm" = "historic_team"))

# Attach per-team color data, keyed on the modern abbreviation.
# NOTE(review): `team_color` and `division`, used further down, presumably
# come from one of these two CSVs -- confirm against the files.
team.colors <- read.csv("team-colors.csv", stringsAsFactors = FALSE)
df <- left_join(df, team.colors, by = c("modern_team" = "tm"))
# Payroll percentile within each season (0 = lowest payroll that year,
# 1 = highest), used instead of a z-score.
# ungroup() afterwards so the per-year grouping does not silently carry over
# into the later steps that use `df`.
df <- df %>%
  dplyr::group_by(year) %>%
  dplyr::mutate(rank = percent_rank(est_payroll)) %>%
  dplyr::ungroup()

# Enumerate the distinct divisions; one plot is produced per level.
df$division <- as.factor(df$division)
divisions <- levels(df$division)
# Create one PNG per division, with one facet per team.

# The league-wide reference data is identical for every division, so build it
# once instead of on every iteration. Dropping `modern_team` (the facet
# variable) makes ggplot repeat this layer in every facet.
df.league <- within(df, modern_team <- NULL)

for (div in divisions) {
  df.division <- filter(df, division == div)
  p <- ggplot(
    df.division,
    aes(x = rank, y = wins_losses, color = team_color)
  ) +
    geom_point(alpha = 0.75, size = 4) +
    # reference lines at a .500 record and at the 50th payroll percentile
    geom_hline(yintercept = 0.5) +
    geom_vline(xintercept = 0.5) +
    # grey curve: quadratic fit over all teams in the data
    # NOTE: ggplot2 >= 3.4.0 prefers `linewidth` over `size` for lines;
    # `size` is kept for compatibility with older ggplot2.
    stat_smooth(
      data = df.league, color = "grey", size = 1,
      method = "lm", formula = y ~ poly(x, 2), se = FALSE
    ) +
    # colored curve: quadratic fit for the facet's own team
    stat_smooth(size = 2, method = "lm", formula = y ~ poly(x, 2), se = FALSE) +
    # team_color values are used directly as colors
    scale_color_identity() +
    # x is the within-season payroll percentile, not a z-score, so the axis
    # title says so; `limits` is spelled out rather than partially matched
    scale_x_continuous(
      name = "Payroll Percentile\n(rank among teams in the same season)",
      breaks = c(0, 0.5, 1), limits = c(-0.1, 1.1),
      labels = c("0%", "50%", "100%")
    ) +
    scale_y_continuous(
      name = "Win/Loss %", breaks = seq(0.3, 0.7, 0.1),
      limits = c(0.25, 0.75)
    ) +
    facet_wrap(~modern_team, ncol = 5, scales = "free_x") +
    theme_fivethirtyeight() +
    ggtitle(div)
  ggsave(filename = paste0(div, ".png"), plot = p, width = 15, height = 4)
}
# Fit a single quadratic model of winning percentage on payroll percentile
# across all team-seasons.
fit <- lm(wins_losses ~ poly(rank, 2), data = df)

# Compare every team-season with the model's expectation:
#   expected_winpct / expected_w : predicted win % and wins over `g` games
#   diff_w / diff_winpct         : actual minus expected
#   posneg                       : sign of diff_winpct, as a factor
df <- df %>%
  dplyr::mutate(
    expected_winpct = predict(fit, newdata = data.frame(rank = rank)),
    expected_w = expected_winpct * g,
    diff_w = w - expected_w,
    diff_winpct = wins_losses - expected_winpct,
    posneg = as.factor(sign(diff_winpct))
  )

# Order teams by their average wins above (or below) expectation.
rankings <- df %>%
  group_by(modern_team) %>%
  summarise(avg_diff = mean(diff_w)) %>%
  arrange(desc(avg_diff))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment