2021 model.Rmd

#2021 baseball model
library(baseballr)
library(reshape2)
library(tidyverse)
library(Lahman)
library(rvest)
library(xml2)
library(readr)

# Park factors for the 2021 season, as returned by fg_park().
park_factor <- fg_park(2021)


# Team-season records from the Lahman `Teams` table: keep only the columns
# needed for the run-differential analysis, then only seasons after 2000.
teams <- Teams %>%
  select(teamID, yearID, lgID, G, W, L, R, RA) %>%
  filter(yearID > 2000)

# team stats data for analysis
#
# Pull the batting and pitching columns used by the regressions from the
# Lahman `Teams` table, starting at `first_year` (inclusive). The column list
# lives in one place so the three look-back windows below cannot drift apart.
team_stats_since <- function(first_year) {
  Teams %>%
    filter(yearID >= first_year) %>%
    select(
      teamID, yearID, lgID, G, W, L, R, RA, AB, H, X2B, X3B, HR, BB, SO,
      ER, ERA, HA, HRA, BBA, SOA, HBP, SF
    )
}

# Seasons from 2018 onward.
team_stats <- team_stats_since(2018)

# Seasons from 2011 onward.
team_stats_10 <- team_stats_since(2011)

# Seasons from 2001 onward.
team_stats_20 <- team_stats_since(2001)

# earned run adjustment
# Runs allowed, earned runs and errors for every team-season after 2000.
earned_run_data <- Teams %>%
  select(teamID, yearID, RA, ER, E) %>%
  filter(yearID > 2000)

# 2021 data for building model, read from a CSV in the user's home directory.
TeamData2021 <- read.csv(file = "~/GitHub/baseball model/2021TeamData.csv")

# Add run differential (RD) and winning percentage (Wpct) to each team-season.
teams <- teams %>%
  mutate(RD = R - RA, Wpct = W / (W + L))

# Scatter plot of winning percentage against run differential.
run_diff <- ggplot(teams, aes(x = RD, y = Wpct)) +
  geom_point() +
  scale_x_continuous("Run Differential") +
  scale_y_continuous("Win Pct")

# Overlay the least-squares line without a confidence band. FALSE is spelled
# out because `F` is an ordinary variable that can be reassigned.
run_diff + geom_smooth(method = "lm", se = FALSE)


# check regression of win pct vs. run diff

lm_RD <- lm(Wpct ~ RD, data = teams)

summary(lm_RD)

###
<!-- Call: -->
<!-- lm(formula = Wpct ~ RD, data = teams) -->

<!-- Residuals: -->
<!--       Min        1Q    Median        3Q       Max  -->
<!-- -0.133702 -0.018865 -0.000675  0.017166  0.131268  -->

<!-- Coefficients: -->
<!--              Estimate Std. Error t value Pr(>|t|)     -->
<!-- (Intercept) 5.000e-01  1.154e-03  433.30   <2e-16 *** -->
<!-- RD          6.281e-04  1.055e-05   59.52   <2e-16 *** -->
<!-- --- -->
<!-- Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 -->

<!-- Residual standard error: 0.02826 on 598 degrees of freedom -->
<!-- Multiple R-squared:  0.8556,	Adjusted R-squared:  0.8553  -->
<!-- F-statistic:  3542 on 1 and 598 DF,  p-value: < 2.2e-16 -->
###

# check residuals
library(lmtest)
library(fBasics)

# Extract the residuals once, by their full component name. The previous
# `lm_RD$resid` only resolved through `$` partial matching on the lm object.
resid_RD <- lm_RD$residuals

dwtest(lm_RD) # Durbin-Watson test on the fitted model
jarqueberaTest(resid_RD) # Test residuals for normality

# Diagnostic plots: which = 1 is residuals vs fitted, which = 2 is normal Q-Q.
plot(lm_RD, which = 1, col = "blue")
plot(lm_RD, which = 2, col = "red")

#residuals look normal and randomly distributed

#win pct = 0.5000 + 0.0006281 * run_diff

# expected Wpct
# Expected winning percentage from runs scored and allowed with exponent 1.83,
# together with its residual against the observed winning percentage.
teams <- teams %>%
  mutate(
    ExpWpct = R^1.83 / (R^1.83 + RA^1.83),
    resExpWpct = Wpct - ExpWpct
  )

# Root-mean-square error of the expected winning percentage.
summarize(teams, rmse = sqrt(mean(resExpWpct^2)))

# rmse = 0.02572

# incremental runs needed for a win
#
# RS: runs scored (default 5); RA: runs allowed (default 5).
# Returns (RS^2 + RA^2)^2 / (2 * RS * RA^2), computed elementwise, so vector
# inputs give a vector result.
IR <- function(RS = 5, RA = 5) {
  squared_total <- RS^2 + RA^2
  squared_total^2 / (2 * RS * RA^2)
}

# Every combination of runs scored and runs allowed from 3 to 6 in steps of 0.5.
ir_table <- expand_grid(RS = seq(3, 6, 0.5), RA = seq(3, 6, 0.5))

# Incremental runs per win laid out as a table: one row per RS, one column per
# RA (named "RA=<value>"), rounded to one decimal. pivot_wider() replaces the
# superseded spread(); names_prefix reproduces the column names that
# spread(..., sep = "=") generated.
ir_table %>%
  mutate(IRW = IR(RS, RA)) %>%
  pivot_wider(names_from = RA, values_from = IRW, names_prefix = "RA=") %>%
  round(1)
  
<!-- # A tibble: 7 x 8 -->
<!--      RS `RA=3` `RA=3.5` `RA=4` `RA=4.5` `RA=5` `RA=5.5` `RA=6` -->
<!--   <dbl>  <dbl>    <dbl>  <dbl>    <dbl>  <dbl>    <dbl>  <dbl> -->
<!-- 1   3      6        6.1    6.5      7      7.7      8.5    9.4 -->
<!-- 2   3.5    7.2      7      7.1      7.5    7.9      8.5    9.2 -->
<!-- 3   4      8.7      8.1    8        8.1    8.4      8.8    9.4 -->
<!-- 4   4.5   10.6      9.6    9.1      9      9.1      9.4    9.8 -->
<!-- 5   5     12.8     11.3   10.5     10.1   10       10.1   10.3 -->
<!-- 6   5.5   15.6     13.4   12.2     11.4   11.1     11     11.1 -->
<!-- 7   6     18.8     15.8   14.1     13     12.4     12.1   12 -->

# teams data for regression from Lahman database

# Add the derived rate statistics used by the hits-per-run regressions.
#
# stats: a data frame with the columns H, R, BB, HBP, AB, SF, X2B, X3B, HR,
#        RA and ER.
# Returns `stats` with these columns appended:
#   hits_per_run     H / R
#   OBP              (H + BB + HBP) / (AB + BB + HBP + SF)
#   SLG              total bases / AB
#   ISO              SLG minus batting average (H / AB)
#   BBR              BB / AB
#   runs_earnedruns  RA / ER
add_rate_stats <- function(stats) {
  stats %>%
    mutate(
      hits_per_run = H / R,
      OBP = (H + BB + HBP) / (AB + BB + HBP + SF),
      SLG = ((H - X2B - X3B - HR) + (X2B * 2) + (X3B * 3) + (HR * 4)) / AB,
      # Batting average is H / AB. The earlier denominator
      # (AB - BB - HBP - SF) subtracted events that at-bats already exclude,
      # which overstated AVG and disagreed with the 2021 data's
      # ISO = SLG - AVG.
      # NOTE: regression coefficients hard-coded later in this script were fit
      # with the earlier ISO definition and should be re-estimated.
      ISO = SLG - H / AB,
      BBR = BB / AB,
      runs_earnedruns = RA / ER
    )
}

# Apply the same definitions to all three look-back windows.
team_stats <- add_rate_stats(team_stats)
team_stats_10 <- add_rate_stats(team_stats_10)
team_stats_20 <- add_rate_stats(team_stats_20)


# Mean slugging percentage per teamID across the seasons in team_stats.
summarize(group_by(team_stats, teamID), SLGPCT = mean(SLG))
  
  
# run regressions
# Fit hits-per-run on OBP, SLG and BBR for each of the three look-back windows.
reg_hits_per_run <- lm(hits_per_run ~ OBP + SLG + BBR, data = team_stats)
reg_hits_per_run_10 <- lm(
  hits_per_run ~ OBP + SLG + BBR,
  data = team_stats_10
)
reg_hits_per_run_20 <- lm(
  hits_per_run ~ OBP + SLG + BBR,
  data = team_stats_20
)

# One summary per model. The first model's summary used to be printed three
# times because the call was copy-pasted after each fit.
summary(reg_hits_per_run)
summary(reg_hits_per_run_10)
summary(reg_hits_per_run_20)


<!-- Residuals: -->
<!--       Min        1Q    Median        3Q       Max  -->
<!-- -0.162491 -0.045088  0.003764  0.042388  0.218219  -->

<!-- Coefficients: -->
<!--             Estimate Std. Error t value Pr(>|t|)     -->
<!-- (Intercept)   2.7958     0.2180  12.827  < 2e-16 *** -->
<!-- OBP           4.4987     1.2477   3.605 0.000522 *** -->
<!-- SLG          -3.5490     0.4768  -7.443 6.98e-11 *** -->
<!-- BBR          -9.4799     0.7453 -12.719  < 2e-16 *** -->
<!-- --- -->
<!-- Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 -->

<!-- Residual standard error: 0.07468 on 86 degrees of freedom -->
<!-- Multiple R-squared:  0.8015,	Adjusted R-squared:  0.7946  -->
<!-- F-statistic: 115.8 on 3 and 86 DF,  p-value: < 2.2e-16 -->

# check residuals
# Extract the residuals once, by their full component name. The previous
# `reg_hits_per_run$resid` only resolved through `$` partial matching.
resid_reg_hits <- reg_hits_per_run$residuals
jarqueberaTest(resid_reg_hits) # Test residuals for normality

# Diagnostic plots: which = 1 is residuals vs fitted, which = 2 is normal Q-Q.
plot(reg_hits_per_run, which = 1, col = "blue")
plot(reg_hits_per_run, which = 2, col = "red")

#residuals look random and normally distributed

#hits / run = 2.7958 + 4.4987 * OBP - 3.549 * SLG - 9.4799 * BBR

# Derive 2021 rate statistics and the expected runs implied by the fitted
# hits-per-run equation (coefficients hard-coded from an earlier fit).
# Assumes the CSV supplies BB, AB, SLG, AVG, OBP, H and R columns.
TeamData2021 <- mutate(
  TeamData2021,
  BBR = BB / AB,
  ISO = SLG - AVG,
  hits_per_run = H / R,
  exp_hits_per_run = 2.7958 + 4.4987 * OBP - 3.549 * SLG - 9.4799 * BBR,
  # Expected runs: actual hits divided by the predicted hits needed per run.
  exp_runs = H / exp_hits_per_run,
  exp_runs_scored_diff = exp_runs - R
)

# run regression
# Same response on the 2021 data only, with ISO as an extra predictor.
reg_hits_per_run_2021 <- lm(
  hits_per_run ~ OBP + SLG + BBR + ISO,
  data = TeamData2021
)
summary(reg_hits_per_run_2021)

# combine data
# Reduce each data set to the regression variables so they can be stacked.
data20112020 <- select(team_stats_10, OBP, SLG, BBR, ISO, hits_per_run)
data20012020 <- select(team_stats_20, OBP, SLG, BBR, ISO, hits_per_run)
data20182020 <- select(team_stats, OBP, SLG, BBR, ISO, hits_per_run)
data2021 <- select(TeamData2021, OBP, SLG, BBR, ISO, hits_per_run)

# Append the 2021 rows to each look-back window.
full_data <- rbind(data20182020, data2021)
full_data_10 <- rbind(data20112020, data2021)
full_data_20 <- rbind(data20012020, data2021)

# Fit the four-predictor model on each combined data set.
full_reg <- lm(hits_per_run ~ OBP + SLG + BBR + ISO, data = full_data)
full_reg_10 <- lm(hits_per_run ~ OBP + SLG + BBR + ISO, data = full_data_10)
full_reg_20 <- lm(hits_per_run ~ OBP + SLG + BBR + ISO, data = full_data_20)

# Print the fits in the same order as before: base, 10-year, 20-year.
summary(full_reg)
summary(full_reg_10)
summary(full_reg_20)

# hits / run = 2.7539 + 4.0603 * OBP - 2.8383 * SLG - 8.9444 * BBR - 1.20 * ISO

# Mean and quantiles of hits per run over the combined data set.
mean_hits_per_run <- mean(full_data[["hits_per_run"]])
quantile_hits_per_run <- quantile(full_data[["hits_per_run"]])

# Mean and quantiles of RA / ER from team_stats. Despite the "unearned" in
# the variable names, the values are the runs-to-earned-runs ratio.
mean_runs_unearnedruns <- mean(team_stats[["runs_earnedruns"]])
quantile_runs_unearned_runs <- quantile(team_stats[["runs_earnedruns"]])


# run data again
# Recompute the 2021 expected runs with the five-term equation that includes
# ISO. The ISO column must already exist on TeamData2021.
TeamData2021 <- mutate(
  TeamData2021,
  BBR = BB / AB,
  hits_per_run = H / R,
  exp_hits_per_run = 2.7539 + 4.0603 * OBP - 2.8383 * SLG - 8.9444 * BBR - 1.20 * ISO,
  exp_runs = H / exp_hits_per_run,
  exp_runs_scored_diff = exp_runs - R
)
  
# Mean and quantiles of the 2021 hits-per-run values.
mean_hits_per_run_2021 <- mean(TeamData2021[["hits_per_run"]])
quantile_hits_per_run_2021 <- quantile(TeamData2021[["hits_per_run"]])

# density of h/r
p <- ggplot(data = TeamData2021, mapping = aes(x = hits_per_run)) +
  geom_density()
p

# Same density with a dashed blue line at the mean.
p +
  geom_vline(
    aes(xintercept = mean(hits_per_run)),
    color = "blue",
    linetype = "dashed",
    size = 1
  )


# earned runs analysis
# For each season, the average across teams of runs allowed per earned run.
by_year <- summarize(
  group_by(earned_run_data, yearID),
  yearly_avg = mean(RA / ER)
)

# Plotted value is RA / ER - 1, i.e. unearned runs relative to earned runs.
p2 <- ggplot(by_year, aes(x = yearID, y = yearly_avg - 1)) +
  geom_line()
p2 + ggtitle("Percentage of unearned runs by year")

# Average of the yearly RA / ER ratios.
mean(by_year[["yearly_avg"]])

# Regress the runs-to-earned-runs ratio on team errors.
reg_ER <- lm(RA / ER ~ E, data = earned_run_data)