Defensive Credit & WAR
hc_x and hc_y: coordinates where batted ball is first fieldedhit_location: position of player who first fielding ball(125, 200)def_atbat2024: at-bat level data table containing
type and events: i.e. end_type and end_events
desend_type: did last pitch of at-bat end in Strike, Ball, or contact (X)?
events: more granular description of at-bat outcome
All events when contact is made
double double_play field_error
7608 336 1093
field_out fielders_choice fielders_choice_out
72233 373 306
force_out grounded_into_double_play home_run
3408 3152 5326
sac_bunt sac_fly sac_fly_double_play
446 1222 13
single triple triple_play
25363 685 1
<NA>
0
events for balls and strikesbip <- def_atbat2024 |> dplyr::filter(end_type == "X")
bip$end_events[65138] <- "fielders_choice_out"
out_events <-
c("double_play", "field_out", "fielders_choice_out",
"force_out", "grounded_into_double_play",
"sac_bunt", "sac_fly", "sac_fly_double_play",
"triple_play")
bip <-
bip |>
dplyr::filter(end_events != "home_run" & !is.na(x) & !is.na(y)) |>
dplyr::mutate(Out = ifelse(end_events %in% out_events, 1, 0)) bin_probs <-
bip |>
dplyr::select(x, y, Out) |>
dplyr::mutate(
x_bin = cut(x, breaks = seq(-300-grid_sep/2, 300+grid_sep/2, by = grid_sep)),
y_bin = cut(y, breaks = seq(-100 - grid_sep/2, 500+grid_sep/2, by = grid_sep))) |>
dplyr::group_by(x_bin, y_bin) |>
dplyr::summarise(
out_prob = mean(Out),
n_balls = dplyr::n(),
.groups = "drop")\[ \log\left(\frac{\mathbb{P}(\textrm{out})}{1 - \mathbb{P}(\textrm{out})} \right) = s(x,y), \]
mgcv::bam for large datasetsall_preds <-
predict(object = gam_fit,
newdata = def_atbat2024,
type = "response")
def_atbat2024$p_out <- all_preds
def_atbat2024 <-
def_atbat2024 |>
dplyr::mutate(
p_out = dplyr::case_when(
is.na(p_out) & end_events == "home_run" ~ 0,
is.na(p_out) & end_type != "X" ~ 0,
.default = p_out)) |>
dplyr::filter(!is.na(p_out))# A tibble: 10 × 3
Name key_mlbam RAA_p
<chr> <int> <dbl>
1 Tarik Skubal 669373 14.7
2 Chris Sale 519242 14.6
3 Ryan Walker 676254 13.5
4 Cade Smith 671922 13.3
5 Paul Skenes 694973 12.3
6 Emmanuel Clase 661403 11.2
7 Kirby Yates 489446 9.99
8 Griffin Jax 643377 9.53
9 Edwin Uceta 670955 9.23
10 Garrett Crochet 676979 9.13
raa_f3 |>
dplyr::inner_join(y = player2024_lookup, by = "key_mlbam") |>
dplyr::select(Name, RAA_f3) |>
dplyr::arrange(dplyr::desc(RAA_f3)) |>
dplyr::slice_head(n = 10)# A tibble: 10 × 2
Name RAA_f3
<chr> <dbl>
1 Carlos Santana 55.7
2 Christian Walker 54.6
3 Paul Goldschmidt 52.4
4 Bryce Harper 51.1
5 Matt Olson 51.0
6 Ryan Mountcastle 48.7
7 Vladimir Guerrero 47.1
8 Michael Toglia 46.5
9 Freddie Freeman 46.2
10 Josh Naylor 46.2
raa <-
raa_b |>
dplyr::select(-Name) |>
dplyr::full_join(y = raa_br |> dplyr::select(-Name), by = "key_mlbam") |>
dplyr::full_join(y = raa_p |> dplyr::select(-Name), by = "key_mlbam") |>
dplyr::full_join(y = raa_f |> dplyr::select(-Name), by = "key_mlbam") |>
tidyr::replace_na(list(RAA_b = 0, RAA_br = 0, RAA_f = 0, RAA_p = 0)) |>
dplyr::mutate(RAA = RAA_b + RAA_br + RAA_f + RAA_p) |>
dplyr::left_join(y = player2024_lookup |> dplyr::select(key_mlbam, Name), by = "key_mlbam") |>
dplyr::select(Name, key_mlbam, RAA, RAA_b, RAA_br, RAA_f, RAA_p)# A tibble: 10 × 7
Name key_mlbam RAA RAA_b RAA_br RAA_f RAA_p
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Bobby Witt 677951 165. 61.1 24.9 78.7 0
2 Gunnar Henderson 683002 117. 44.8 -2.82 75.0 0
3 Elly De La Cruz 682829 114. 19.4 15.4 79.4 0
4 Jose Ramirez 608070 113. 30.9 32.7 49.7 0
5 Zach Neto 687263 110. 3.57 15.4 91.2 0
6 Marcus Semien 543760 108. 0.501 17.7 89.6 0
7 Ketel Marte 606466 101. 35.2 10.5 55.3 0
8 Vladimir Guerrero 665489 99.0 47.3 -0.734 52.4 0
9 Francisco Lindor 596019 98.5 30.6 0.699 67.2 0
10 Jose Altuve 514888 98.2 19.3 13.9 65.0 0
Replacement level is simply the level of production you could get from a player that would cost you nothing but the league minimum salary to acquire.
we believe that a team making the MLB minimum would win about 29.7% of its games in a give year, or roughly 47-48 per team
all_players <- unique(
c(def_atbat2024$batter, def_atbat2024$pitcher, def_atbat2024$fielder_2,
def_atbat2024$fielder_3, def_atbat2024$fielder_4, def_atbat2024$fielder_5,
def_atbat2024$fielder_6, def_atbat2024$fielder_7, def_atbat2024$fielder_8, def_atbat2024$fielder_9))
pitchers <- unique(def_atbat2024$pitcher)
position_players <- all_players[!all_players %in% pitchers]position_pa <-
def_atbat2024 |>
dplyr::filter(batter %in% position_players) |>
dplyr::group_by(batter) |>
dplyr::summarise(n = dplyr::n()) |>
dplyr::arrange(dplyr::desc(n)) |>
dplyr::rename(key_mlbam = batter)
pitcher_pa <-
def_atbat2024 |>
dplyr::group_by(pitcher) |>
dplyr::summarise(n = dplyr::n()) |>
dplyr::arrange(dplyr::desc(n)) |>
dplyr::rename(key_mlbam = pitcher)repl_position_raa <-
raa |>
dplyr::filter(key_mlbam %in% repl_position_players) |>
dplyr::inner_join(y = position_pa, by = "key_mlbam") |>
dplyr::select(Name, key_mlbam, RAA, n)
repl_pitch_raa <-
raa |>
dplyr::filter(key_mlbam %in% repl_pitchers) |>
dplyr::inner_join(y = pitcher_pa, by = "key_mlbam") |>
dplyr::select(Name, key_mlbam, RAA, n)
repl_avg_pos <- sum(repl_position_raa$RAA)/sum(repl_position_raa$n)
repl_avg_pitch <- sum(repl_pitch_raa$RAA)/sum(repl_pitch_raa$n)
cat("Replacement-level per-at-bat RAA (position players):", round(repl_avg_pos, digits = 4), "\n")
cat("Replacement-level per-at-bat RAA (pitchers):", round(repl_avg_pitch, digits = 4),"\n")Replacement-level per-at-bat RAA (position players): -0.002
Replacement-level per-at-bat RAA (pitchers): -0.0243
# A tibble: 10 × 6
Name key_mlbam RAA n shadowRAA WAR
<chr> <dbl> <dbl> <int> <dbl> <dbl>
1 Bobby Witt 677951 165. 694 -1.36 16.6
2 Gunnar Henderson 683002 117. 702 -1.37 11.8
3 Elly De La Cruz 682829 114. 679 -1.33 11.5
4 Jose Ramirez 608070 113. 657 -1.28 11.5
5 Zach Neto 687263 110. 590 -1.15 11.1
6 Marcus Semien 543760 108. 701 -1.37 10.9
7 Ketel Marte 606466 101. 562 -1.10 10.2
8 Vladimir Guerrero 665489 99.0 671 -1.31 10.0
9 Francisco Lindor 596019 98.5 689 -1.35 9.98
10 Jose Altuve 514888 98.2 661 -1.29 9.95