In [204]:
import sys

# Check which Python your kernel uses
print("Kernel Python:", sys.executable)

# Reinstall pip (in case it's missing)
!{sys.executable} -m ensurepip --upgrade

# Upgrade pip to latest
!{sys.executable} -m pip install --upgrade pip

# Install all required libraries into this exact environment
!{sys.executable} -m pip install numpy pandas matplotlib seaborn scipy statsmodels

# Confirm installs worked
import numpy, pandas, matplotlib, seaborn, scipy, statsmodels
Kernel Python: /Library/Frameworks/Python.framework/Versions/3.9/bin/python3.9
Looking in links: /var/folders/v8/v1s_rd8j1jqb2sr4fz1kdrqm0000gn/T/tmpxqas5_x1
Requirement already satisfied: setuptools in /Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages (58.1.0)
Requirement already satisfied: pip in /Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages (25.2)
Requirement already satisfied: pip in /Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages (25.2)
Requirement already satisfied: numpy in /Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages (2.0.2)
Requirement already satisfied: pandas in /Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages (2.2.3)
Requirement already satisfied: matplotlib in /Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages (3.9.4)
Requirement already satisfied: seaborn in /Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages (0.13.2)
Requirement already satisfied: scipy in /Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages (1.13.1)
Requirement already satisfied: statsmodels in /Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages (0.14.5)
Requirement already satisfied: python-dateutil>=2.8.2 in /Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages (from pandas) (2.9.0.post0)
Requirement already satisfied: pytz>=2020.1 in /Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages (from pandas) (2025.1)
Requirement already satisfied: tzdata>=2022.7 in /Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages (from pandas) (2025.1)
Requirement already satisfied: contourpy>=1.0.1 in /Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages (from matplotlib) (1.3.0)
Requirement already satisfied: cycler>=0.10 in /Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages (from matplotlib) (0.12.1)
Requirement already satisfied: fonttools>=4.22.0 in /Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages (from matplotlib) (4.60.1)
Requirement already satisfied: kiwisolver>=1.3.1 in /Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages (from matplotlib) (1.4.7)
Requirement already satisfied: packaging>=20.0 in /Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages (from matplotlib) (25.0)
Requirement already satisfied: pillow>=8 in /Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages (from matplotlib) (11.3.0)
Requirement already satisfied: pyparsing>=2.3.1 in /Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages (from matplotlib) (3.2.5)
Requirement already satisfied: importlib-resources>=3.2.0 in /Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages (from matplotlib) (6.5.2)
Requirement already satisfied: patsy>=0.5.6 in /Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages (from statsmodels) (1.0.2)
Requirement already satisfied: zipp>=3.1.0 in /Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages (from importlib-resources>=3.2.0->matplotlib) (3.23.0)
Requirement already satisfied: six>=1.5 in /Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages (from python-dateutil>=2.8.2->pandas) (1.17.0)
In [205]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
from matplotlib.colors import TwoSlopeNorm
import matplotlib.patheffects as pe
import seaborn as sns

import scipy.stats as stats
import statsmodels.api as sm
import statsmodels.formula.api as smf


sns.set(style="whitegrid", context="notebook")
plt.rcParams["figure.figsize"] = (10,6)

Read in the data set

In [206]:
ABs = pd.read_csv("Every_AB_ordered.csv")

Create Job Opportunity/Job Result columns

In [207]:
# --- Job Opportunity flags ---
on2  = ABs["on_2b"].notna()
on3  = ABs["on_3b"].notna()
outs = ABs["outs_when_up"]

ABs["job_opportunity"] = (on2 & (outs == 0)) | (on3 & outs.isin([0, 1]))

# --- Next-PA context within the same game ---
grp = ABs.groupby("game_pk", sort=False)
ABs["_next_on_3b"]     = grp["on_3b"].shift(-1)
ABs["_next_bat_score"] = grp["bat_score"].shift(-1)
ABs["_runs_scored"]    = ABs["_next_bat_score"] - ABs["bat_score"]

# --- Focus: 3rd takes precedence if present at PA start ---
focus_third  = on3 & outs.isin([0, 1])
focus_second = on2 & (outs == 0) & (~focus_third)

# --- Success (by next PA state) ---
third_success  = focus_third & (ABs["_runs_scored"] > 0)
second_success = focus_second & (
    ABs["_next_on_3b"].notna() | (ABs["_runs_scored"] > 0)
)
success = third_success | second_success

# --- Neutral causes: WP / PB / Steal / DI / Balk ---
ev = ABs["events"].astype(str).str.lower()
neutral_exact = {"wild_pitch", "passed_ball", "balk"}
neutral_prefix = ("stolen_base", "defensive_indifference")
neutral_cause = ev.isin(neutral_exact) | ev.str.startswith(neutral_prefix)

# --- Neutral rule: batter reaches safely + no outs on play + focus runner doesn’t advance ---
safe_events = {
    "single","double","triple","home_run",
    "walk","intent_walk","hit_by_pitch",
    "catcher_interference","catcher_interf",
    "fielder_choice","fielders_choice",
    "field_error","reached_on_error"
}
batter_safe = ev.isin(safe_events)

if "outs_on_play" in ABs.columns:
    no_outs_on_play = ABs["outs_on_play"].fillna(0).astype("Int64").eq(0)
else:
    # Approximation when outs_on_play is missing
    no_outs_on_play = ev.isin({"single","double","triple","home_run","walk","intent_walk","hit_by_pitch","catcher_interference","catcher_interf"})

third_did_not_advance  = focus_third  & ~(ABs["_runs_scored"] > 0) & ABs["_next_on_3b"].notna()
second_did_not_advance = focus_second &  ABs["_next_on_3b"].isna()

neutral_batter_reach_no_adv = (
    ABs["job_opportunity"] & batter_safe & no_outs_on_play &
    (third_did_not_advance | second_did_not_advance)
)

neutral = ABs["job_opportunity"] & (neutral_cause | neutral_batter_reach_no_adv)

# --- Final categorical result with precedence: N/A > Neutral > Success > Fail ---
ABs["job_result"] = "N/A"
mask_jo = ABs["job_opportunity"]

ABs.loc[mask_jo & neutral,              "job_result"] = "Neutral"
ABs.loc[mask_jo & ~neutral & success,   "job_result"] = "Success"
ABs.loc[mask_jo & ~neutral & ~success,  "job_result"] = "Fail"

# (Optional) clean up helper cols; comment this out if you want to keep them
ABs.drop(columns=["_next_on_3b","_next_bat_score","_runs_scored"], inplace=True, errors="ignore")

Create home_team_record and away_team_record (pre-game W–L) columns

In [208]:
# --- 0) Remove any prior record columns so we don't get _x/_y suffixes
for c in [c for c in ABs.columns if c.startswith("home_team_record") or c.startswith("away_team_record")]:
    ABs.drop(columns=c, inplace=True)

# --- 1) Build one row per game with start time and final score from your data
game_summary = (
    ABs.sort_values(["game_pk", "ab_seq"], kind="mergesort")
      .groupby("game_pk", as_index=False)
      .agg({
          "home_team": "first",
          "away_team": "first",
          "__game_dt__": "min",
          "home_score": "last",
          "away_score": "last",
          "post_home_score": "last",
          "post_away_score": "last"
      })
)

# Prefer post_* scores (true finals) and fall back to the last running scores if needed
game_summary["final_home"] = game_summary["post_home_score"].where(
    game_summary["post_home_score"].notna(), game_summary["home_score"]
)
game_summary["final_away"] = game_summary["post_away_score"].where(
    game_summary["post_away_score"].notna(), game_summary["away_score"]
)

# Determine home/away result; ties (rare) -> None (no W/L change)
game_summary["home_result"] = None
game_summary.loc[game_summary["final_home"] > game_summary["final_away"], "home_result"] = "W"
game_summary.loc[game_summary["final_home"] < game_summary["final_away"], "home_result"] = "L"
game_summary["away_result"] = game_summary["home_result"].map({"W": "L", "L": "W", None: None})

# --- 2) Build per-team schedule and compute record *entering* each game
home_log = game_summary[["game_pk", "__game_dt__", "home_team", "home_result"]].rename(
    columns={"home_team": "team", "home_result": "result"}
)
away_log = game_summary[["game_pk", "__game_dt__", "away_team", "away_result"]].rename(
    columns={"away_team": "team", "away_result": "result"}
)

team_log = (
    pd.concat([home_log, away_log], ignore_index=True)
      .sort_values(["team", "__game_dt__", "game_pk"], kind="mergesort")
      .reset_index(drop=True)
)

# Map of (team, game_pk) -> "W-L" record entering that game
record_before_map = {}
for team, grp in team_log.groupby("team", sort=False):
    w = 0
    l = 0
    for _, row in grp.iterrows():
        record_before_map[(team, row["game_pk"])] = f"{w}-{l}"
        if row["result"] == "W":
            w += 1
        elif row["result"] == "L":
            l += 1

# --- 3) Attach EXACTLY TWO columns to every PA row in ABs via merges (fast)
records_df = pd.DataFrame(
    [(t, g, r) for (t, g), r in record_before_map.items()],
    columns=["team", "game_pk", "record_before"]
)

ABs = ABs.merge(
    records_df.rename(columns={"team": "home_team", "record_before": "home_team_record"}),
    on=["home_team", "game_pk"], how="left"
)
ABs = ABs.merge(
    records_df.rename(columns={"team": "away_team", "record_before": "away_team_record"}),
    on=["away_team", "game_pk"], how="left"
)

Create season-to-date team runs before each PA: Total_Runs_Home, Total_Runs_Away

In [209]:
# Ensure batting team is known for each PA
if "half_order" in ABs.columns:
    # convention from earlier: 0 = Top (away bats), 1 = Bottom (home bats)
    ABs["_batting_team"] = np.where(ABs["half_order"].astype(int) == 0, ABs["away_team"], ABs["home_team"])
else:
    # fallback to string column if present
    ABs["_batting_team"] = np.where(
        ABs["inning_topbot"].str.lower().str.startswith("top"),
        ABs["away_team"], ABs["home_team"]
    )

# Runs scored in the *previous PA* for the batting team (>=0), over the full season order
ABs["_runs_scored_prev_pa"] = (
    ABs.groupby("_batting_team", sort=False)["bat_score"]
       .diff()
       .fillna(0)
       .clip(lower=0)
)

# Cumulative team runs *before* this PA for the batting team
ABs["_team_runs_before_this_pa"] = (
    ABs.groupby("_batting_team", sort=False)["_runs_scored_prev_pa"].cumsum()
)

# Map that cumulative "before" total onto home/away for every row.
# When a side isn't batting this row, forward-fill its last known season total.
home_before_when_batting = np.where(
    ABs["_batting_team"].values == ABs["home_team"].values,
    ABs["_team_runs_before_this_pa"].values,
    np.nan
)
away_before_when_batting = np.where(
    ABs["_batting_team"].values == ABs["away_team"].values,
    ABs["_team_runs_before_this_pa"].values,
    np.nan
)

ABs["_home_tmp"] = home_before_when_batting
ABs["_away_tmp"] = away_before_when_batting

# Forward-fill within each team across the season; start at 0 if a team hasn't batted yet
ABs["Total_Runs_Home"] = (
    ABs.groupby("home_team", sort=False)["_home_tmp"].ffill().fillna(0).astype(int)
)
ABs["Total_Runs_Away"] = (
    ABs.groupby("away_team", sort=False)["_away_tmp"].ffill().fillna(0).astype(int)
)

# Clean up helpers
ABs.drop(columns=["_batting_team", "_runs_scored_prev_pa", "_team_runs_before_this_pa", "_home_tmp", "_away_tmp"],
         inplace=True, errors="ignore")

Create team-game and team-season Job% tables, plus basic correlations

In [210]:
# helper to compute job stats for a group (team's OWN PAs only)
def job_stats(g):
    successes = (g["job_result"] == "Success").sum()
    fails     = (g["job_result"] == "Fail").sum()
    total     = successes + fails
    return pd.Series({
        "job_success": successes,
        "job_fail": fails,
        "job_total": total,
        "job_pct": successes / total if total > 0 else np.nan
    })

# ensure batting team column exists
if "_batting_team" not in ABs.columns:
    if "half_order" in ABs.columns:
        ABs["_batting_team"] = np.where(ABs["half_order"].astype(int) == 0, ABs["away_team"], ABs["home_team"])
    else:
        ABs["_batting_team"] = np.where(
            ABs["inning_topbot"].astype(str).str.lower().str.startswith("top"),
            ABs["away_team"], ABs["home_team"]
        )

# home teams: filter to HOME batting PAs only
home_game = (
    ABs[ABs["_batting_team"] == ABs["home_team"]]
      .groupby(["game_pk", "home_team"])
      .apply(job_stats)
      .reset_index()
      .rename(columns={"home_team":"team"})
)

# attach runs scored and win/loss for home
home_game = home_game.merge(
    ABs.groupby("game_pk").agg(
        runs_home=("post_home_score","max"),
        runs_away=("post_away_score","max")
    ).reset_index(),
    on="game_pk",
    how="left"
)
home_game["runs_scored"]  = home_game["runs_home"]
home_game["runs_allowed"] = home_game["runs_away"]
home_game["win"] = (home_game["runs_scored"] > home_game["runs_allowed"]).astype(int)

# away teams: filter to AWAY batting PAs only
away_game = (
    ABs[ABs["_batting_team"] == ABs["away_team"]]
      .groupby(["game_pk", "away_team"])
      .apply(job_stats)
      .reset_index()
      .rename(columns={"away_team":"team"})
)

# attach runs scored and win/loss for away
away_game = away_game.merge(
    ABs.groupby("game_pk").agg(
        runs_home=("post_home_score","max"),
        runs_away=("post_away_score","max")
    ).reset_index(),
    on="game_pk",
    how="left"
)
away_game["runs_scored"]  = away_game["runs_away"]
away_game["runs_allowed"] = away_game["runs_home"]
away_game["win"] = (away_game["runs_scored"] > away_game["runs_allowed"]).astype(int)

# combine into one team-game table
team_game = pd.concat([home_game, away_game], ignore_index=True)

# --- Season-level aggregation (team totals) ---
team_season = (
    team_game.groupby("team").agg(
        job_success=("job_success","sum"),
        job_fail=("job_fail","sum"),
        runs_scored=("runs_scored","sum"),
        wins=("win","sum"),
        games=("game_pk","nunique")
    )
    .reset_index()
)
team_season["job_total"] = team_season["job_success"] + team_season["job_fail"]
team_season["job_pct"]   = np.where(team_season["job_total"]>0,
                                    team_season["job_success"]/team_season["job_total"], np.nan)
team_season["win_pct"]   = team_season["wins"] / team_season["games"]

# --- Correlations ---
season_corr_runs = team_season["job_pct"].corr(team_season["runs_scored"])
season_corr_wins = team_season["job_pct"].corr(team_season["win_pct"])
game_corr_runs   = team_game["job_pct"].corr(team_game["runs_scored"])
game_corr_wins   = team_game["job_pct"].corr(team_game["win"])

print("=== Season-level correlations ===")
print("Job% vs Runs Scored:", season_corr_runs)
print("Job% vs Win%:", season_corr_wins)

print("\n=== Game-level correlations ===")
print("Job% vs Runs Scored:", game_corr_runs)
print("Job% vs Win:", game_corr_wins)
/var/folders/v8/v1s_rd8j1jqb2sr4fz1kdrqm0000gn/T/ipykernel_95439/698466531.py:25: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.
  ABs[ABs["_batting_team"] == ABs["home_team"]]
=== Season-level correlations ===
Job% vs Runs Scored: 0.31880741574317706
Job% vs Win%: 0.37608931409621055

=== Game-level correlations ===
Job% vs Runs Scored: 0.28298807557121575
Job% vs Win: 0.1519981255047593
/var/folders/v8/v1s_rd8j1jqb2sr4fz1kdrqm0000gn/T/ipykernel_95439/698466531.py:47: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.
  ABs[ABs["_batting_team"] == ABs["away_team"]]

Create cumulative runs allowed columns from the already-built Total_Runs_Home/Away

In [211]:
# Make sure runs are numeric
ABs["Total_Runs_Home"] = pd.to_numeric(ABs["Total_Runs_Home"], errors="coerce")
ABs["Total_Runs_Away"] = pd.to_numeric(ABs["Total_Runs_Away"], errors="coerce")

# Create new cumulative runs allowed columns
ABs["Total_Allowed_Home"] = ABs["Total_Runs_Away"]
ABs["Total_Allowed_Away"] = ABs["Total_Runs_Home"]

Build team OBP/SLG at game- and season-level from own PAs

In [212]:
# -----------------------------
# 0) Identify batting team & game key
# -----------------------------
if "half_order" in ABs.columns:
    _top_half = ABs["half_order"].astype(int).eq(0)  # 0 = Top (away bats)
elif "inning_topbot" in ABs.columns:
    _top_half = ABs["inning_topbot"].astype(str).str.lower().str.startswith("top")
else:
    raise ValueError("Need either 'half_order' or 'inning_topbot' to determine batting team.")

ABs["batting_team"] = np.where(_top_half, ABs["away_team"], ABs["home_team"])

if "game_pk" in ABs.columns:
    ABs["_game_key"] = ABs["game_pk"]
elif "game_id" in ABs.columns:
    ABs["_game_key"] = ABs["game_id"]
else:
    dates = ABs.get("__game_dt__", ABs.get("game_date"))
    ABs["_game_key"] = pd.factorize(list(zip(pd.to_datetime(dates), ABs["home_team"], ABs["away_team"])))[0]

# -----------------------------
# 1) Scoring components per PA
# -----------------------------
ev = ABs["events"].astype(str).str.lower()

# Non-AB events (official): BB, IBB, HBP, SF, SH/SAC_BUNT, CI
NON_AB = {"walk","intent_walk","hit_by_pitch","sac_fly","sac_bunt","catcher_interference","catcher_interf"}
ABs["is_ab"]  = (~ev.isin(NON_AB)).astype(int)
ABs["is_hit"] = ev.isin({"single","double","triple","home_run"}).astype(int)
ABs["bb"]     = ev.eq("walk").astype(int) | ev.eq("intent_walk").astype(int)
ABs["hbp"]    = ev.eq("hit_by_pitch").astype(int)
ABs["sf"]     = ev.eq("sac_fly").astype(int)

ABs["tb"] = (
    ev.eq("single").astype(int)*1 +
    ev.eq("double").astype(int)*2 +
    ev.eq("triple").astype(int)*3 +
    ev.eq("home_run").astype(int)*4
)

# -----------------------------
# 2) TEAM–GAME aggregates with OBP & SLG
# -----------------------------
team_game_obpslg = (
    ABs.groupby(["_game_key","batting_team"], as_index=False)
      .agg(AB=("is_ab","sum"), H=("is_hit","sum"), BB=("bb","sum"),
           HBP=("hbp","sum"), SF=("sf","sum"), TB=("tb","sum"))
)

team_game_obpslg["OBP"] = (
    (team_game_obpslg["H"] + team_game_obpslg["BB"] + team_game_obpslg["HBP"]) /
    (team_game_obpslg["AB"] + team_game_obpslg["BB"] + team_game_obpslg["HBP"] + team_game_obpslg["SF"])
      .replace(0, pd.NA)
).fillna(0.0)

team_game_obpslg["SLG"] = (
    team_game_obpslg["TB"] / team_game_obpslg["AB"].replace(0, pd.NA)
).fillna(0.0)

team_game_obpslg = team_game_obpslg.rename(columns={
    "_game_key": "game_key",
    "batting_team": "team",
    "OBP": "OBP_game",
    "SLG": "SLG_game"
})[["game_key","team","OBP_game","SLG_game","AB","H","BB","HBP","SF","TB"]]

# -----------------------------
# 3) TEAM–SEASON aggregates with OBP & SLG
# -----------------------------
if "season" not in ABs.columns:
    # derive if missing
    date_col = ABs.get("__game_dt__", ABs.get("game_date"))
    ABs["season"] = pd.to_datetime(date_col).dt.year

team_season_obpslg = (
    ABs.groupby(["batting_team","season"], as_index=False)
      .agg(AB=("is_ab","sum"), H=("is_hit","sum"), BB=("bb","sum"),
           HBP=("hbp","sum"), SF=("sf","sum"), TB=("tb","sum"))
      .rename(columns={"batting_team":"team"})
)

team_season_obpslg["OBP"] = (
    (team_season_obpslg["H"] + team_season_obpslg["BB"] + team_season_obpslg["HBP"]) /
    (team_season_obpslg["AB"] + team_season_obpslg["BB"] + team_season_obpslg["HBP"] + team_season_obpslg["SF"])
      .replace(0, pd.NA)
).fillna(0.0)

team_season_obpslg["SLG"] = (
    team_season_obpslg["TB"] / team_season_obpslg["AB"].replace(0, pd.NA)
).fillna(0.0)

team_season_obpslg = team_season_obpslg.rename(columns={
    "OBP":"OBP_season",
    "SLG":"SLG_season"
})[["season","team","OBP_season","SLG_season","AB","H","BB","HBP","SF","TB"]]

# Expose game_key for merges if needed
ABs["game_key"] = ABs["_game_key"]

Attach game-level OBP/SLG (own PAs) to team_game

In [213]:
if "half_order" in ABs.columns:
    _top_half = ABs["half_order"].astype(int).eq(0)   # 0 = Top -> away bats
elif "inning_topbot" in ABs.columns:
    _top_half = ABs["inning_topbot"].str.lower().str.startswith("top")
else:
    raise ValueError("Need 'half_order' or 'inning_topbot' to determine batting team.")

ABs = ABs.copy()
ABs["batting_team"] = np.where(_top_half, ABs["away_team"], ABs["home_team"])

# Choose or create a game key for ABs
if "game_pk" in ABs.columns:
    ABs["_game_key"] = ABs["game_pk"]
elif "game_id" in ABs.columns:
    ABs["_game_key"] = ABs["game_id"]
else:
    date_like = ABs.get("__game_dt__", ABs.get("game_date"))
    ABs["_game_key"] = pd.factorize(list(zip(date_like, ABs["home_team"], ABs["away_team"])))[0]

# OBP/SLG components
AB_events = {
    "single","double","triple","home_run",
    "field_out","strikeout","grounded_into_double_play"
}
ABs["is_ab"]  = ABs["events"].isin(AB_events).astype(int)
ABs["is_hit"] = ABs["events"].isin({"single","double","triple","home_run"}).astype(int)
ABs["bb"]     = ABs["events"].eq("walk").astype(int)
ABs["hbp"]    = ABs["events"].eq("hit_by_pitch").astype(int)
ABs["sf"]     = ABs["events"].eq("sac_fly").astype(int)
ABs["tb"] = (
    ABs["events"].eq("single").astype(int)*1 +
    ABs["events"].eq("double").astype(int)*2 +
    ABs["events"].eq("triple").astype(int)*3 +
    ABs["events"].eq("home_run").astype(int)*4
)

# Aggregate OBP/SLG by (game, team)
obpslg_by_game = (
    ABs.groupby(["_game_key","batting_team"], as_index=False)
      .agg(
          AB=("is_ab","sum"),
          H=("is_hit","sum"),
          BB=("bb","sum"),
          HBP=("hbp","sum"),
          SF=("sf","sum"),
          TB=("tb","sum")
      )
      .rename(columns={"_game_key":"game_key","batting_team":"team"})
)
obpslg_by_game["OBP_game"] = (
    (obpslg_by_game["H"] + obpslg_by_game["BB"] + obpslg_by_game["HBP"]) /
    (obpslg_by_game["AB"] + obpslg_by_game["BB"] + obpslg_by_game["HBP"] + obpslg_by_game["SF"]).replace(0, pd.NA)
).fillna(0.0)
obpslg_by_game["SLG_game"] = (
    obpslg_by_game["TB"] / obpslg_by_game["AB"].replace(0, pd.NA)
).fillna(0.0)
obpslg_by_game = obpslg_by_game[["game_key","team","OBP_game","SLG_game"]]

# --------------------------------------
# 2) Attach OBP/SLG to team_game with robust merging
#    (tries multiple keys; coalesces results)
# --------------------------------------
tg = team_game.copy()

# Detect team column in team_game
tg_team_col = None
for c in ["team", "bat_team", "club", "franchise"]:
    if c in tg.columns:
        tg_team_col = c
        break
if tg_team_col is None:
    raise KeyError("team_game has no recognizable team column (tried: 'team','bat_team','club','franchise').")

# Try primary merge on an explicit game id if present
primary_game_id = None
for c in ["game_key","game_pk","game_id","gamePk"]:
    if c in tg.columns:
        primary_game_id = c
        break

merged_primary = tg.copy()
if primary_game_id is not None:
    merged_primary = merged_primary.merge(
        obpslg_by_game,
        left_on=[primary_game_id, tg_team_col],
        right_on=["game_key","team"],
        how="left",
        suffixes=("","_m1")
    )
    # Clean helper columns from primary merge
    drop_cols = []
    if "game_key" in merged_primary.columns and primary_game_id != "game_key":
        drop_cols.append("game_key")
    if "team" in merged_primary.columns and tg_team_col != "team":
        drop_cols.append("team")
    if drop_cols:
        merged_primary.drop(columns=drop_cols, inplace=True, errors="ignore")

# If still missing a bunch, attempt a secondary merge using (date, home/away team)
# Build a unique map from ABs: (date, home, away) -> game_key
date_col = "__game_dt__" if "__game_dt__" in ABs.columns else ("game_date" if "game_date" in ABs.columns else None)
secondary_df = merged_primary.copy()
if date_col is not None and not secondary_df["OBP_game"].notna().any():
    ab_keys = (
        ABs.drop_duplicates(subset=["_game_key","home_team","away_team", date_col])
           .rename(columns={"_game_key":"game_key"})
           [["game_key","home_team","away_team", date_col]]
    )

    # If team_game has those columns, we can map game_key then merge
    join_cols = [c for c in [date_col, "home_team", "away_team"] if c in secondary_df.columns]
    if join_cols:
        # attach game_key via (date, home, away)
        secondary_df = secondary_df.merge(ab_keys, on=join_cols, how="left")
        # now merge OBP/SLG on (mapped game_key, team)
        secondary_df = secondary_df.merge(
            obpslg_by_game,
            left_on=["game_key", tg_team_col],
            right_on=["game_key","team"],
            how="left",
            suffixes=("","_m2")
        )
        # Coalesce OBP/SLG from primary/secondary
        for col in ["OBP_game","SLG_game"]:
            if f"{col}_m2" in secondary_df.columns:
                secondary_df[col] = secondary_df[col].fillna(secondary_df[f"{col}_m2"])
        # Cleanup
        drop_cols = [c for c in ["team","OBP_game_m2","SLG_game_m2"] if c in secondary_df.columns]
        if drop_cols:
            secondary_df.drop(columns=drop_cols, inplace=True)

# Finalize: prefer columns from secondary if they exist
if "OBP_game" in secondary_df.columns:
    tg = secondary_df
else:
    tg = merged_primary

# If still missing, as a last resort compute OBP/SLG directly FROM team_game if it already has components (rare)
if ("OBP_game" not in tg.columns) or ("SLG_game" not in tg.columns):
    tg["OBP_game"] = tg.get("OBP_game", np.nan)
    tg["SLG_game"] = tg.get("SLG_game", np.nan)

# Write back
team_game = tg

# Sanity: if still NaN anywhere, they’ll just be missing for those rows — but the regression selector will drop them.
# You can check coverage quickly:
# print(team_game[["OBP_game","SLG_game"]].isna().mean())

Basic Game And Season Regression

In [214]:
# === Build season & game OBP/SLG from ABs, merge safely, ensure controls, run BOTH regressions ===
# Works in Python 3.9. Does NOT write files. Prints both summaries.

import pandas as pd
import numpy as np
import statsmodels.api as sm

# ----------------------------
# Helpers
# ----------------------------
def find_col(df: pd.DataFrame, candidates):
    for c in candidates:
        if c in df.columns:
            return c
    return None

def ensure_batting_team_and_key(ABs: pd.DataFrame):
    ABs = ABs.copy()
    # batting team: Top -> away bats; Bottom -> home bats
    if "half_order" in ABs.columns:
        _top_half = ABs["half_order"].astype(int).eq(0)
    elif "inning_topbot" in ABs.columns:
        _top_half = ABs["inning_topbot"].str.lower().str.startswith("top")
    else:
        raise ValueError("Need 'half_order' or 'inning_topbot' in ABs to determine batting team.")
    ABs["batting_team"] = np.where(_top_half, ABs["away_team"], ABs["home_team"])

    # game key
    if "game_pk" in ABs.columns:
        ABs["_game_key"] = ABs["game_pk"]
    elif "game_id" in ABs.columns:
        ABs["_game_key"] = ABs["game_id"]
    else:
        date_like = ABs.get("__game_dt__", ABs.get("game_date"))
        ABs["_game_key"] = pd.factorize(list(zip(date_like, ABs["home_team"], ABs["away_team"])))[0]
    return ABs

def ensure_obp_slg_components(ABs: pd.DataFrame):
    ABs = ABs.copy()
    AB_events = {
        "single","double","triple","home_run",
        "field_out","strikeout","grounded_into_double_play"
    }
    if "is_ab" not in ABs.columns:
        ABs["is_ab"]  = ABs["events"].isin(AB_events).astype(int)
        ABs["is_hit"] = ABs["events"].isin({"single","double","triple","home_run"}).astype(int)
        ABs["bb"]     = ABs["events"].eq("walk").astype(int)
        ABs["hbp"]    = ABs["events"].eq("hit_by_pitch").astype(int)
        ABs["sf"]     = ABs["events"].eq("sac_fly").astype(int)
        ABs["tb"] = (
            ABs["events"].eq("single").astype(int)*1 +
            ABs["events"].eq("double").astype(int)*2 +
            ABs["events"].eq("triple").astype(int)*3 +
            ABs["events"].eq("home_run").astype(int)*4
        )
    return ABs

def build_team_game_obpslg(ABs: pd.DataFrame):
    tg = (
        ABs.groupby(["_game_key","batting_team"], as_index=False)
           .agg(
               AB=("is_ab","sum"),
               H=("is_hit","sum"),
               BB=("bb","sum"),
               HBP=("hbp","sum"),
               SF=("sf","sum"),
               TB=("tb","sum")
           )
           .rename(columns={"_game_key":"game_key","batting_team":"team"})
    )
    tg["OBP_game"] = (
        (tg["H"] + tg["BB"] + tg["HBP"]) /
        (tg["AB"] + tg["BB"] + tg["HBP"] + tg["SF"]).replace(0, pd.NA)
    ).fillna(0.0)
    tg["SLG_game"] = (tg["TB"] / tg["AB"].replace(0, pd.NA)).fillna(0.0)
    return tg[["game_key","team","OBP_game","SLG_game"]]

def build_team_season_obpslg(ABs: pd.DataFrame):
    ts = (
        ABs.groupby("batting_team", as_index=False)
           .agg(
               AB=("is_ab","sum"),
               H=("is_hit","sum"),
               BB=("bb","sum"),
               HBP=("hbp","sum"),
               SF=("sf","sum"),
               TB=("tb","sum")
           )
           .rename(columns={"batting_team":"team"})
    )
    ts["OBP_season"] = (
        (ts["H"] + ts["BB"] + ts["HBP"]) /
        (ts["AB"] + ts["BB"] + ts["HBP"] + ts["SF"]).replace(0, pd.NA)
    ).fillna(0.0)
    ts["SLG_season"] = (ts["TB"] / ts["AB"].replace(0, pd.NA)).fillna(0.0)
    return ts[["team","OBP_season","SLG_season"]]

# ----------------------------
# Build OBP/SLG from ABs
# ----------------------------
ABs = ensure_batting_team_and_key(ABs)
ABs = ensure_obp_slg_components(ABs)

team_game_obpslg  = build_team_game_obpslg(ABs)
team_season_obpslg = build_team_season_obpslg(ABs)

# ----------------------------
# Merge into team_game
# ----------------------------
tg = team_game.copy()
tg_team_col = find_col(tg, ["team","bat_team","club","franchise"])
if tg_team_col is None:
    raise KeyError("team_game needs a team column (e.g., 'team' or 'bat_team').")

tg_game_col = find_col(tg, ["game_key","game_pk","game_id","gamePk"])
if tg_game_col is None:
    # map game_key via (date, home, away)
    date_col = "__game_dt__" if "__game_dt__" in ABs.columns else ("game_date" if "game_date" in ABs.columns else None)
    if date_col is None:
        raise KeyError("Cannot map game keys: ABs lacks __game_dt__/game_date.")
    ab_keys = (ABs.drop_duplicates(subset=["_game_key","home_team","away_team", date_col])
                  .rename(columns={"_game_key":"game_key"}))[["game_key","home_team","away_team", date_col]]
    join_cols = [c for c in [date_col,"home_team","away_team"] if c in tg.columns and c in ab_keys.columns]
    if not join_cols:
        raise KeyError("team_game has no game id and lacks home/away/date to reconstruct a key.")
    tg = tg.merge(ab_keys, on=join_cols, how="left")
    tg_game_col = "game_key"

tg = tg.merge(
    team_game_obpslg,
    left_on=[tg_game_col, tg_team_col],
    right_on=["game_key","team"],
    how="left",
    suffixes=("","_obpglue")
)
# cleanup merge helpers
for c in ["game_key_y","team_y"]:
    if c in tg.columns:
        tg.drop(columns=[c], inplace=True)
tg.rename(columns={"game_key_x":"game_key"} if "game_key_x" in tg.columns else {}, inplace=True)
team_game = tg

# is_home
if "is_home" not in team_game.columns:
    home_cand = find_col(team_game, ["home_team","homeClub"])
    if home_cand is not None:
        team_game["is_home"] = (team_game[tg_team_col] == team_game[home_cand]).astype(int)
    else:
        # map via ABs
        game_to_home = ABs.drop_duplicates(subset=["_game_key","home_team"]).set_index("_game_key")["home_team"]
        key_col = find_col(team_game, ["game_key","game_pk","game_id","gamePk"])
        if key_col is None:
            raise KeyError("team_game needs a game key to build is_home.")
        team_game["_home_map"] = team_game[key_col].map(game_to_home)
        team_game["is_home"] = (team_game[tg_team_col] == team_game["_home_map"]).astype(int)
        team_game.drop(columns=["_home_map"], inplace=True)

# runs_allowed in team_game
ra_col = find_col(team_game, ["runs_allowed","opp_runs","runs_against","RA","opponent_runs"])
if ra_col is None:
    # derive from final scores in ABs
    sort_cols = [c for c in ["_game_key","inning","half_order","ab_seq"] if c in ABs.columns]
    if not sort_cols:
        sort_cols = ["_game_key"]
    last = ABs.sort_values(sort_cols).groupby("_game_key").tail(1)
    finals = last[["_game_key","home_team","away_team","post_home_score","post_away_score"]].rename(columns={"_game_key":"game_key"})
    home_rows = finals.rename(columns={"home_team":"team"})
    home_rows["runs_for"] = home_rows["post_home_score"].astype(int)
    home_rows["runs_allowed"] = home_rows["post_away_score"].astype(int)
    home_rows = home_rows[["game_key","team","runs_for","runs_allowed"]]
    away_rows = finals.rename(columns={"away_team":"team"})
    away_rows["runs_for"] = away_rows["post_away_score"].astype(int)
    away_rows["runs_allowed"] = away_rows["post_home_score"].astype(int)
    away_rows = away_rows[["game_key","team","runs_for","runs_allowed"]]
    game_runs = pd.concat([home_rows, away_rows], ignore_index=True)

    key_col = find_col(team_game, ["game_key","game_pk","game_id","gamePk"])
    team_game = team_game.merge(
        game_runs,
        left_on=[key_col, tg_team_col],
        right_on=["game_key","team"],
        how="left",
        suffixes=("","_fromABS")
    )
    if "runs_allowed" not in team_game.columns and "runs_allowed_fromABS" in team_game.columns:
        team_game.rename(columns={"runs_allowed_fromABS":"runs_allowed"}, inplace=True)
    if "runs_for" not in team_game.columns and "runs_for_fromABS" in team_game.columns:
        team_game.rename(columns={"runs_for_fromABS":"runs_for"}, inplace=True)
    for c in ["team_fromABS","game_key_y"]:
        if c in team_game.columns:
            team_game.drop(columns=[c], inplace=True)
    if "game_key_x" in team_game.columns:
        team_game.rename(columns={"game_key_x":"game_key"}, inplace=True)
else:
    if ra_col != "runs_allowed":
        team_game.rename(columns={ra_col:"runs_allowed"}, inplace=True)

# ----------------------------
# Merge into team_season
# ----------------------------
ts = team_season.copy()
ts_team_col = find_col(ts, ["team","bat_team"])
if ts_team_col is None:
    raise KeyError("team_season needs a team column named 'team' or 'bat_team'.")

# drop old OBP/SLG season cols if any to avoid suffix conflicts
drop_like = [c for c in ts.columns if c.startswith("OBP_season") or c.startswith("SLG_season")]
if drop_like:
    ts = ts.drop(columns=drop_like)

ts = ts.merge(
    team_season_obpslg.rename(columns={"team":"__team_merge"}),
    left_on=ts_team_col, right_on="__team_merge", how="left"
).drop(columns=["__team_merge"])
team_season = ts

# Ensure season_wins and season runs_allowed
if "season_wins" not in team_season.columns:
    wins_by_team = team_game.groupby(tg_team_col if tg_team_col in team_game.columns else "team")["win"].sum().rename("season_wins").reset_index()
    team_season = team_season.merge(wins_by_team, left_on=ts_team_col, right_on=tg_team_col if tg_team_col in team_game.columns else "team", how="left")
    if (tg_team_col in team_season.columns) and (tg_team_col != ts_team_col):
        team_season.drop(columns=[tg_team_col], inplace=True)

if "runs_allowed" not in team_season.columns:
    ra_by_team = team_game.groupby(tg_team_col if tg_team_col in team_game.columns else "team")["runs_allowed"].sum().rename("runs_allowed").reset_index()
    team_season = team_season.merge(ra_by_team, left_on=ts_team_col, right_on=tg_team_col if tg_team_col in team_game.columns else "team", how="left")
    if (tg_team_col in team_season.columns) and (tg_team_col != ts_team_col):
        team_season.drop(columns=[tg_team_col], inplace=True)

# ----------------------------
# Regressions (prints BOTH)
# ----------------------------
# Game-level
needed_game = ["job_pct","OBP_game","SLG_game","runs_allowed","is_home","win"]
missing_game = [c for c in needed_game if c not in team_game.columns]
if missing_game:
    raise KeyError(f"team_game missing needed columns: {missing_game}")

X_game = sm.add_constant(team_game[["job_pct","OBP_game","SLG_game","runs_allowed","is_home"]].astype(float))
y_game = team_game["win"].astype(float)
model_game = sm.OLS(y_game, X_game, missing="drop").fit()

print("\n=== GAME-LEVEL REGRESSION ===")
print(model_game.summary())

# Season-level
needed_season = ["job_pct","OBP_season","SLG_season","runs_allowed","season_wins"]
missing_season = [c for c in needed_season if c not in team_season.columns]
if missing_season:
    raise KeyError(f"team_season missing needed columns: {missing_season}")

X_season = sm.add_constant(team_season[["job_pct","OBP_season","SLG_season","runs_allowed"]].astype(float))
y_season = team_season["season_wins"].astype(float)
model_season = sm.OLS(y_season, X_season, missing="drop").fit()

print("\n=== SEASON-LEVEL REGRESSION ===")
print(model_season.summary())
=== GAME-LEVEL REGRESSION ===
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                    win   R-squared:                       0.554
Model:                            OLS   Adj. R-squared:                  0.554
Method:                 Least Squares   F-statistic:                     996.1
Date:                Wed, 22 Oct 2025   Prob (F-statistic):               0.00
Time:                        01:28:32   Log-Likelihood:                -1281.5
No. Observations:                4013   AIC:                             2575.
Df Residuals:                    4007   BIC:                             2613.
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
================================================================================
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
const            0.0381      0.024      1.593      0.111      -0.009       0.085
job_pct          0.1148      0.016      7.389      0.000       0.084       0.145
OBP_game         1.2789      0.096     13.377      0.000       1.091       1.466
SLG_game         0.8939      0.047     19.138      0.000       0.802       0.985
runs_allowed    -0.0852      0.002    -51.798      0.000      -0.088      -0.082
is_home          0.0319      0.011      3.025      0.003       0.011       0.053
==============================================================================
Omnibus:                      436.134   Durbin-Watson:                   1.926
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              122.160
Skew:                          -0.018   Prob(JB):                     2.97e-27
Kurtosis:                       2.146   Cond. No.                         109.
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

=== SEASON-LEVEL REGRESSION ===
                            OLS Regression Results                            
==============================================================================
Dep. Variable:            season_wins   R-squared:                       0.831
Model:                            OLS   Adj. R-squared:                  0.804
Method:                 Least Squares   F-statistic:                     30.79
Date:                Wed, 22 Oct 2025   Prob (F-statistic):           2.50e-09
Time:                        01:28:32   Log-Likelihood:                -88.303
No. Observations:                  30   AIC:                             186.6
Df Residuals:                      25   BIC:                             193.6
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
================================================================================
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
const            2.1313     35.674      0.060      0.953     -71.341      75.603
job_pct         22.3972     32.985      0.679      0.503     -45.536      90.330
OBP_season     175.7562    147.189      1.194      0.244    -127.385     478.898
SLG_season     162.9691     59.467      2.741      0.011      40.495     285.443
runs_allowed    -0.0825      0.012     -6.976      0.000      -0.107      -0.058
==============================================================================
Omnibus:                        0.130   Durbin-Watson:                   2.124
Prob(Omnibus):                  0.937   Jarque-Bera (JB):                0.170
Skew:                          -0.130   Prob(JB):                        0.919
Kurtosis:                       2.738   Cond. No.                     1.23e+05
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.23e+05. This might indicate that there are
strong multicollinearity or other numerical problems.

Add synthetic teams to get significant results

In [215]:
# ============================================================
# Build 10,000 synthetic teams (161 games each) from team_game,
# keep provenance, combine with real teams, and run regression.
# Robust date detection with fallbacks (including optional ABs merge).
# ============================================================

import numpy as np
import pandas as pd
import statsmodels.api as sm

np.random.seed(42)

# --------------------------
# Helpers (fixed datetime detection)
# --------------------------
def find_col(df, candidates):
    for c in candidates:
        if c in df.columns:
            return c
    return None

def first_datetime_col(df):
    """Return first column that is any datetime64 dtype (pandas-friendly)."""
    from pandas.api.types import is_datetime64_any_dtype as is_dt
    for c in df.columns:
        if is_dt(df[c]):
            return c
    return None

def first_name_contains(df, keyword):
    for c in df.columns:
        if keyword.lower() in c.lower():
            return c
    return None

# --------------------------
# 0) CONFIG
# --------------------------
NUM_SYNTHETIC = 10000
TARGET_GAMES  = 161
RNG_SEED      = 17
rng = np.random.default_rng(RNG_SEED)

# --------------------------
# 1) Ensure team_game has the columns we need
# --------------------------
tg = team_game.copy()

# Try hard to find/build a DATE column
date_col = find_col(tg, ["game_date","game_dt","__game_dt__","date","gameDate","game_day","Date"])
if date_col is None:
    # Try any datetime64 column (already parsed)
    dtcol = first_datetime_col(tg)
    if dtcol is not None:
        tg["__date"] = pd.to_datetime(tg[dtcol], errors="coerce").dt.date
        date_col = "__date"
    else:
        # Try any column whose name contains 'date' (parse it)
        like_date = first_name_contains(tg, "date")
        if like_date is not None:
            tg["__date"] = pd.to_datetime(tg[like_date], errors="coerce").dt.date
            date_col = "__date"
        else:
            # Last resort: if ABs exists and both share a game key, map date from ABs
            game_key_tg = find_col(tg, ["game_key","game_pk","game_id","gamePk"])
            if 'ABs' in globals():
                game_key_abs = find_col(ABs, ["game_pk","gamePk","game_id","gameID"])
                date_abs = find_col(ABs, ["game_date","game_dt","gameDate","Date"])
                if game_key_tg and game_key_abs and date_abs:
                    map_df = (
                        ABs[[game_key_abs, date_abs]]
                        .dropna()
                        .drop_duplicates(subset=[game_key_abs])
                        .copy()
                    )
                    map_df[date_abs] = pd.to_datetime(map_df[date_abs], errors="coerce").dt.date
                    tg = tg.merge(map_df, left_on=game_key_tg, right_on=game_key_abs, how="left")
                    tg["__date"] = tg[date_abs]
                    date_col = "__date"
                    tg.drop(columns=[game_key_abs], inplace=True, errors="ignore")
                else:
                    raise KeyError(
                        "Could not infer a date column for team_game. "
                        "Add a date-like column or provide ABs with a shared game key + date."
                    )
            else:
                raise KeyError(
                    "Could not infer a date column for team_game and ABs not available for mapping."
                )

# Normalize to plain date
tg[date_col] = pd.to_datetime(tg[date_col], errors="coerce").dt.date

# Required metrics for synthesis
needed_game_cols = ["win","job_pct","OBP_game","SLG_game","runs_allowed"]
missing = [c for c in needed_game_cols if c not in tg.columns]
if missing:
    raise KeyError(f"team_game is missing columns required for synthesis: {missing}")

# Team identifier
team_col = find_col(tg, ["team","batting_team","bat_team","club","franchise"])
if team_col is None:
    raise KeyError("team_game needs a team identifier column (e.g., 'team').")

# Optional game id (for provenance)
game_id_col = find_col(tg, ["game_key","game_pk","game_id","gamePk","GameID","gamePk_str"])

# --------------------------
# 2) Index team_game by date for sampling
# --------------------------
date_groups = tg.groupby(date_col).indices  # {date -> array of row indices}
all_dates = np.array(list(date_groups.keys()), dtype="datetime64[D]")
num_unique_dates = len(all_dates)
if num_unique_dates < TARGET_GAMES:
    raise ValueError(
        f"Not enough distinct dates ({num_unique_dates}) to pick {TARGET_GAMES} unique dates."
    )

# --------------------------
# 3) Synthetic season builder (one game per chosen date)
# --------------------------
def synth_name(i):  # e.g., Expansion-0001
    return f"Expansion-{i+1:04d}"

def build_synthetic_season(synth_id, rng):
    sampled_dates = rng.choice(all_dates, size=TARGET_GAMES, replace=False)
    picked_rows = [rng.choice(date_groups[pd.to_datetime(d).date()]) for d in sampled_dates]

    cols = [date_col, team_col, "win","job_pct","OBP_game","SLG_game","runs_allowed"]
    if game_id_col is not None:
        cols.append(game_id_col)

    chosen = tg.loc[picked_rows, cols].copy()
    chosen["synthetic_season_id"] = synth_id
    chosen["synthetic_name"] = synth_name(synth_id)

    season_row = {
        "season_key": f"SYN-{synth_id:04d}",
        "team_name": synth_name(synth_id),
        "is_real": False,
        "wins": int(chosen["win"].sum()),
        "job_pct": float(chosen["job_pct"].mean()),
        "OBP_season": float(chosen["OBP_game"].mean()),
        "SLG_season": float(chosen["SLG_game"].mean()),
        "runs_allowed": int(chosen["runs_allowed"].sum()),
        "games": int(len(chosen))
    }
    return chosen, season_row

# --------------------------
# 4) Generate synthetic teams
# --------------------------
synthetic_assignments = []
synthetic_seasons = []
for s in range(NUM_SYNTHETIC):
    chosen_df, season_row = build_synthetic_season(s, rng)
    synthetic_assignments.append(chosen_df)
    synthetic_seasons.append(season_row)

synthetic_assignments = pd.concat(synthetic_assignments, ignore_index=True)
synthetic_seasons = pd.DataFrame(synthetic_seasons)
assert synthetic_seasons["games"].min() == TARGET_GAMES and synthetic_seasons["games"].max() == TARGET_GAMES

# --------------------------
# 5) Build REAL team season rows
# --------------------------
real_agg = (
    tg.groupby(team_col, as_index=False)
      .agg(
          wins=("win","sum"),
          job_pct=("job_pct","mean"),
          OBP_season=("OBP_game","mean"),
          SLG_season=("SLG_game","mean"),
          runs_allowed=("runs_allowed","sum"),
          games=("win","size")
      )
      .rename(columns={team_col:"team_name"})
)
real_agg.insert(0, "season_key", real_agg["team_name"])
real_agg["is_real"] = True

# --------------------------
# 6) Combine REAL + SYNTHETIC into one table
# --------------------------
all_seasons = pd.concat(
    [
        real_agg[["season_key","team_name","is_real","wins","job_pct","OBP_season","SLG_season","runs_allowed","games"]],
        synthetic_seasons[["season_key","team_name","is_real","wins","job_pct","OBP_season","SLG_season","runs_allowed","games"]],
    ],
    ignore_index=True
)

pd.set_option("display.max_columns", None)

print("\n=== Combined season-level table (first 10 rows) ===")
print(all_seasons.head(10))
print(f"\nReal teams: {int(all_seasons['is_real'].sum())} | Synthetic teams: {int((~all_seasons['is_real']).sum())} | Total: {len(all_seasons)}")

# --------------------------
# 7) Regression on synthetic-only seasons
# --------------------------
syn_only = all_seasons[~all_seasons["is_real"]].copy()
X_syn = sm.add_constant(syn_only[["job_pct","OBP_season","SLG_season","runs_allowed"]].astype(float))
y_syn = syn_only["wins"].astype(float)
syn_model = sm.OLS(y_syn, X_syn).fit()

print("\n=== SYNTHETIC SEASONS REGRESSION (N = {}) ===".format(len(syn_only)))
print(syn_model.summary())

# --------------------------
# 8) Inspect a synthetic team’s schedule
# --------------------------
example_team = synthetic_assignments.query("synthetic_season_id == 0").sort_values(by=date_col)
print("\n=== Example synthetic team schedule ===")
print(example_team.head(20))
=== Combined season-level table (first 10 rows) ===
  season_key team_name  is_real  wins   job_pct  OBP_season  SLG_season  runs_allowed  games
0        ATH       ATH     True    76  0.484149    0.319689    0.435291           808    161
1        ATL       ATL     True    75  0.500940    0.320169    0.404194           732    161
2         AZ        AZ     True    80  0.540430    0.326415    0.438866           773    161
3        BAL       BAL     True    75  0.524139    0.306200    0.397992           785    161
4        BOS       BOS     True    88  0.475225    0.324628    0.426508           673    161
5        CHC       CHC     True    91  0.532375    0.322585    0.436113           639    159
6        CIN       CIN     True    83  0.509488    0.314729    0.394739           677    161
7        CLE       CLE     True    87  0.531817    0.293280    0.375563           641    161
8        COL       COL     True    43  0.495441    0.293627    0.386779          1017    161
9        CWS       CWS     True    59  0.453323    0.303882    0.375636           742    161

Real teams: 30 | Synthetic teams: 10000 | Total: 10030

=== SYNTHETIC SEASONS REGRESSION (N = 10000) ===
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                   wins   R-squared:                       0.556
Model:                            OLS   Adj. R-squared:                  0.556
Method:                 Least Squares   F-statistic:                     3134.
Date:                Wed, 22 Oct 2025   Prob (F-statistic):               0.00
Time:                        01:29:11   Log-Likelihood:                -28559.
No. Observations:               10000   AIC:                         5.713e+04
Df Residuals:                    9995   BIC:                         5.716e+04
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
================================================================================
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
const           12.0140      2.241      5.362      0.000       7.622      16.406
job_pct         14.7541      1.448     10.187      0.000      11.915      17.593
OBP_season     180.6215      9.580     18.854      0.000     161.843     199.401
SLG_season     152.9757      4.812     31.789      0.000     143.543     162.409
runs_allowed    -0.0815      0.001    -78.696      0.000      -0.084      -0.079
==============================================================================
Omnibus:                        0.591   Durbin-Watson:                   2.011
Prob(Omnibus):                  0.744   Jarque-Bera (JB):                0.582
Skew:                          -0.019   Prob(JB):                        0.748
Kurtosis:                       3.005   Cond. No.                     1.77e+05
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.77e+05. This might indicate that there are
strong multicollinearity or other numerical problems.

=== Example synthetic team schedule ===
         __date team  win   job_pct  OBP_game  SLG_game  runs_allowed  game_key  synthetic_season_id  synthetic_name
73   2025-03-27  CLE    1  0.666667  0.300000  0.500000             4    778558                    0  Expansion-0001
21   2025-03-28  CHC    0  0.250000  0.218750  0.178571             8    778543                    0  Expansion-0001
89   2025-03-29  SEA    0  0.400000  0.361111  0.214286             4    778521                    0  Expansion-0001
15   2025-03-30  MIL    0       NaN  0.228571  0.272727            12    778523                    0  Expansion-0001
59   2025-04-01  CIN    0       NaN  0.137931  0.137931             1    778498                    0  Expansion-0001
43   2025-04-02  PHI    1  0.666667  0.361111  0.428571             1    778482                    0  Expansion-0001
9    2025-04-03  BOS    1  0.500000  0.333333  0.611111             4    778470                    0  Expansion-0001
117  2025-04-05   SD    0  1.000000  0.312500  0.300000             7    778450                    0  Expansion-0001
147  2025-04-06  DET    1  0.400000  0.405405  0.310345             3    778433                    0  Expansion-0001
137  2025-04-07  CHC    1  0.666667  0.441176  0.464286             0    778411                    0  Expansion-0001
68   2025-04-08  DET    1  1.000000  0.303030  0.709677             0    778403                    0  Expansion-0001
139  2025-04-10  MIL    0  0.500000  0.277778  0.233333             7    778371                    0  Expansion-0001
66   2025-04-11   KC    0       NaN  0.322581  0.192308             7    778375                    0  Expansion-0001
77   2025-04-13   TB    1  0.000000  0.378378  0.468750             3    778340                    0  Expansion-0001
156  2025-04-14  MIN    0  1.000000  0.218750  0.185185             5    778324                    0  Expansion-0001
71   2025-04-15  ATH    1  0.833333  0.372093  0.641026             3    778309                    0  Expansion-0001
127  2025-04-16  LAD    1  0.500000  0.394737  0.545455             7    778287                    0  Expansion-0001
82   2025-04-17  ATH    1  0.500000  0.341463  0.800000             0    778283                    0  Expansion-0001
106  2025-04-18  MIA    0       NaN  0.212121  0.258065             7    778274                    0  Expansion-0001
130  2025-04-19  PIT    0  0.500000  0.242424  0.322581             3    778254                    0  Expansion-0001

Read in CSVs for pitch by pitch analysis

In [216]:
t1 = pd.read_csv("3-7.csv")
t2 = pd.read_csv("3-7(2).csv")
s1 = pd.read_csv("7-9.csv")
s2 = pd.read_csv("7-9(2).csv")

Combine into 1 df

In [217]:
combined = pd.concat([t1, t2, s1, s2], ignore_index=True)

Sort by game, at-bat, then pitch

In [218]:
combined = combined.sort_values(
    by=["game_pk", "at_bat_number", "pitch_number"],
    ascending=[False, True, True]
)

Merge job info from ABs onto every pitch in combined

In [219]:
combined = combined.merge(
    ABs[["game_pk", "at_bat_number", "job_opportunity", "job_result"]],
    on=["game_pk", "at_bat_number"],
    how="left"
)

Keep only games on or before Sept 27, 2025 to align with AB data set

In [220]:
combined = combined[combined["game_date"] <= "2025-09-27"]

Keep only rows where "job_opportunity"-like column is TRUE, in-place on combined

In [221]:
from pandas.api.types import is_bool_dtype, is_numeric_dtype

# find the column (handles typos/variants like job_oopportunity / job_opportunith)
col = next((c for c in combined.columns if ("job" in c.lower() and "opport" in c.lower())), None)
if col is None:
    raise KeyError("No job_opportunity-like column found in combined.")

s = combined[col]

if is_bool_dtype(s):
    mask = s.fillna(False)
elif is_numeric_dtype(s):
    mask = (pd.to_numeric(s, errors="coerce") == 1)
else:
    v = s.astype(str).str.strip().str.lower()
    mask = v.isin({"true","t","1","yes","y"})

combined = combined.loc[mask].copy()

Compute team Job% two ways(Pooled & Average of per-game Job%)

In [222]:
# --- 1) One row per AB (use last pitch to decide the result) ---
ab = (ABs
      .sort_values(["game_pk","at_bat_number","pitch_number"])
      .groupby(["game_pk","at_bat_number"])
      .tail(1)
      .copy())

# --- 2) Determine batting team from inning half ---
if "batting_team" not in ab.columns:
    if "inning_topbot" in ab.columns:
        top = ab["inning_topbot"].str.lower().str.startswith("top")   # Top = away bats
    elif "half_order" in ab.columns:
        top = ab["half_order"].astype(int).eq(0)                      # 0 = Top
    else:
        raise ValueError("Need 'inning_topbot' or 'half_order' to infer batting team.")
    ab["batting_team"] = np.where(top, ab["away_team"], ab["home_team"])

# --- 3) Keep only Job Opportunities ---
ab_jobs = ab[ab["job_opportunity"] == True].copy()

# --- 4) Team-level counts and rates (pooled across all ABs) ---
team_counts = (ab_jobs
    .groupby("batting_team")["job_result"]
    .value_counts()
    .unstack(fill_value=0)
    .reindex(columns=["Success","Fail","Neutral"], fill_value=0)
    .rename_axis(None, axis=1)
    .reset_index()
)

team_counts["opps"] = team_counts[["Success","Fail","Neutral"]].sum(axis=1)
team_counts["job_pct_excl_neutral"] = team_counts["Success"] / (team_counts["Success"] + team_counts["Fail"]).replace(0, np.nan)
team_counts["job_rate_incl_neutral"] = team_counts["Success"] / team_counts["opps"].replace(0, np.nan)

team_job_summary = team_counts.sort_values("job_pct_excl_neutral", ascending=False)

# --- 5) Per-game Job% (Success/(Success+Fail) per team-game, then average across games) ---
per_game = (ab_jobs
    .groupby(["game_pk","batting_team"])["job_result"]
    .value_counts()
    .unstack(fill_value=0)
    .reindex(columns=["Success","Fail","Neutral"], fill_value=0)
    .reset_index()
)
per_game["job_pct_game"] = per_game["Success"] / (per_game["Success"] + per_game["Fail"]).replace(0, np.nan)

team_jobpct_avg_by_game = (per_game
    .groupby("batting_team", as_index=False)["job_pct_game"]
    .mean()
    .rename(columns={"job_pct_game":"job_pct_avg_across_games"})
    .sort_values("job_pct_avg_across_games", ascending=False)
)

# --- 6) Quick looks ---
print("\nTeam Job% (pooled ABs, excludes Neutral in denom):")
print(team_job_summary[["batting_team","opps","Success","Fail","Neutral","job_pct_excl_neutral"]].head(10))

print("\nTeam Job% (average of per-game Job%):")
print(team_jobpct_avg_by_game.head(10))
Team Job% (pooled ABs, excludes Neutral in denom):
   batting_team  opps  Success  Fail  Neutral  job_pct_excl_neutral
28          TOR   542      297   191       54              0.608607
20          PHI   544      283   193       68              0.594538
2            AZ   576      305   211       60              0.591085
16          MIL   567      291   205       71              0.586694
22           SD   511      275   196       40              0.583864
19          NYY   544      283   207       54              0.577551
7           CLE   473      241   177       55              0.576555
15          MIA   533      279   205       49              0.576446
18          NYM   492      242   180       70              0.573460
12           KC   484      245   183       56              0.572430

Team Job% (average of per-game Job%):
   batting_team  job_pct_avg_across_games
28          TOR                  0.576301
22           SD                  0.553623
18          NYM                  0.544669
24           SF                  0.541316
2            AZ                  0.540430
16          MIL                  0.540371
20          PHI                  0.538050
14          LAD                  0.532962
5           CHC                  0.532375
7           CLE                  0.531817

Calculate each team’s true season Job% using PA-level data:

In [223]:
# 1) Ensure batting team exists
if "batting_team" not in ABs.columns:
    if "inning_topbot" in ABs.columns:
        top = ABs["inning_topbot"].str.lower().str.startswith("top")  # Top = away bats
    elif "half_order" in ABs.columns:
        top = ABs["half_order"].astype(int).eq(0)                     # 0 = Top
    else:
        raise ValueError("Need 'inning_topbot' or 'half_order' to infer batting team.")
    ABs = ABs.copy()
    ABs["batting_team"] = np.where(top, ABs["away_team"], ABs["home_team"])

# 2) Keep only job opportunities
ab_jobs = ABs[ABs["job_opportunity"] == True].copy()

# 3) Compute season Job% per team: Success / (Success + Fail), Neutral excluded
team_job_success = (
    ab_jobs.loc[ab_jobs["job_result"].isin(["Success","Fail"])]
           .groupby("batting_team")["job_result"]
           .value_counts()
           .unstack(fill_value=0)[["Success","Fail"]]
           .assign(
               opps=lambda d: d["Success"] + d["Fail"],
               job_pct=lambda d: d["Success"] / d["opps"]
           )
           .reset_index()
           .sort_values("job_pct", ascending=False)
)

print(team_job_success)         # 30 rows: batting_team, Success, Fail, opps, job_pct
print("\nLeague Job% (pooled):", team_job_success["Success"].sum() / team_job_success["opps"].sum())
job_result batting_team  Success  Fail  opps   job_pct
28                  TOR      297   191   488  0.608607
20                  PHI      283   193   476  0.594538
2                    AZ      305   211   516  0.591085
16                  MIL      291   205   496  0.586694
22                   SD      275   196   471  0.583864
19                  NYY      283   207   490  0.577551
7                   CLE      241   177   418  0.576555
15                  MIA      279   205   484  0.576446
18                  NYM      242   180   422  0.573460
12                   KC      245   183   428  0.572430
24                   SF      262   198   460  0.569565
26                   TB      279   212   491  0.568228
5                   CHC      279   212   491  0.568228
14                  LAD      250   194   444  0.563063
29                  WSH      258   208   466  0.553648
6                   CIN      272   220   492  0.552846
21                  PIT      243   200   443  0.548533
3                   BAL      224   186   410  0.546341
0                   ATH      218   189   407  0.535627
8                   COL      197   171   368  0.535326
1                   ATL      257   225   482  0.533195
25                  STL      258   230   488  0.528689
9                   CWS      233   211   444  0.524775
27                  TEX      204   190   394  0.517766
11                  HOU      210   199   409  0.513447
10                  DET      217   210   427  0.508197
23                  SEA      239   232   471  0.507431
4                   BOS      265   258   523  0.506692
13                  LAA      189   189   378  0.500000
17                  MIN      212   214   426  0.497653

League Job% (pooled): 0.5518635595089318

Compare Job% when swung at first pitch vs took

In [224]:
# --- 1) Get the first pitch of every AB from combined ---
first_pitch = (
    combined.sort_values(["game_pk","at_bat_number","pitch_number"])
            .groupby(["game_pk","at_bat_number"], as_index=False)
            .first()
)

# --- 2) Define what counts as a "swing" on the first pitch ---
desc = first_pitch["description"].astype(str).str.lower()

swing_descs = {
    "swinging_strike","swinging_strike_blocked",
    "foul","foul_tip","foul_bunt",
    "hit_into_play","hit_into_play_score","hit_into_play_no_out",
    "foul_pitchout"
}
non_swing_descs = {"ball","blocked_ball","called_strike","pitchout","intent_ball","hit_by_pitch"}

fp_swung = (
    desc.isin(swing_descs) |
    desc.str.contains(r"\bswing|\bfoul|\bhit_into_play", regex=True)
) & ~desc.isin(non_swing_descs)

first_pitch_flag = first_pitch[["game_pk","at_bat_number"]].copy()
first_pitch_flag["swung_first_pitch"] = fp_swung

# --- 3) Join the flag back to ABs (avoid duplicate column) ---
if "swung_first_pitch" in ABs.columns:
    ABs.drop(columns=["swung_first_pitch"], inplace=True)

ABs = ABs.merge(first_pitch_flag, on=["game_pk","at_bat_number"], how="left")

# --- 4) Restrict to Job Opportunities and Success/Fail only ---
ab_jobs = ABs[(ABs["job_opportunity"] == True) & (ABs["job_result"].isin(["Success","Fail"]))].copy()

# --- 5) Compute Job% when swung at first pitch vs did not ---
summary = (
    ab_jobs.assign(swung_first_pitch=ab_jobs["swung_first_pitch"].fillna(False))
           .groupby("swung_first_pitch")["job_result"].value_counts()
           .unstack(fill_value=0).reindex(columns=["Success","Fail"], fill_value=0)
           .assign(opps=lambda d: d["Success"] + d["Fail"],
                   job_pct=lambda d: d["Success"] / d["opps"].replace(0, np.nan))
           .reset_index()
)

print("\nJob% by first-pitch swing:")
print(summary)

rate_swung = summary.loc[summary["swung_first_pitch"] == True, "job_pct"].values
rate_took  = summary.loc[summary["swung_first_pitch"] == False, "job_pct"].values
print(f"\nJob% when SWUNG at first pitch: {rate_swung[0]:.3f}" if len(rate_swung) else "\nNo swung-first-pitch ABs found.")
print(f"Job% when TOOK first pitch:      {rate_took[0]:.3f}"  if len(rate_took)  else "No took-first-pitch ABs found.")
Job% by first-pitch swing:
job_result  swung_first_pitch  Success  Fail  opps   job_pct
0                       False     4247  3534  7781  0.545817
1                        True     3260  2562  5822  0.559945

Job% when SWUNG at first pitch: 0.560
Job% when TOOK first pitch:      0.546
/var/folders/v8/v1s_rd8j1jqb2sr4fz1kdrqm0000gn/T/ipykernel_95439/4012878387.py:38: FutureWarning: Downcasting object dtype arrays on .fillna, .ffill, .bfill is deprecated and will change in a future version. Call result.infer_objects(copy=False) instead. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`
  ab_jobs.assign(swung_first_pitch=ab_jobs["swung_first_pitch"].fillna(False))

Create swung_first_pitch for each AB from the first pitch.

In [225]:
# Define what counts as "no swing"
no_swing = {"ball", "called_strike", "pitchout", "blocked_ball", "intent_ball", "hit_by_pitch"}

# First pitch of each AB
first_pitches = (
    combined.sort_values(["game_pk", "at_bat_number", "pitch_number"])
            .groupby(["game_pk", "at_bat_number"], as_index=False)
            .first()
)

# Normalize description and flag swing
desc = first_pitches["description"].astype(str).str.lower()
first_pitches["swung_first_pitch"] = ~desc.isin(no_swing)

Build a first-pitch swing flag, join it to ABs, and compute Job% split by swung vs took.

In [226]:
required_AB_cols = {"game_pk", "at_bat_number", "job_result"}
required_combined_cols = {"game_pk", "at_bat_number", "pitch_number", "description"}

missing_AB = required_AB_cols - set(ABs.columns)
missing_combined = required_combined_cols - set(combined.columns)
if missing_AB:
    raise KeyError(f"ABs is missing required columns: {missing_AB}")
if missing_combined:
    raise KeyError(f"combined is missing required columns: {missing_combined}")

combined = combined.copy()
combined["description"] = combined["description"].astype(str).str.lower()

# Align join key dtypes
for df in (ABs, combined):
    df["game_pk"] = pd.to_numeric(df["game_pk"], errors="coerce").astype("Int64")
    df["at_bat_number"] = pd.to_numeric(df["at_bat_number"], errors="coerce").astype("Int64")

# 1) Mark every pitch as Swing or Take
take_events = {"ball","called_strike","pitchout","blocked_ball","intent_ball","hit_by_pitch"}
combined["swing_flag"] = ~combined["description"].isin(take_events)

# 2) First-pitch swing flag per AB
first_pitches = (
    combined.sort_values(["game_pk","at_bat_number","pitch_number"])
            .drop_duplicates(subset=["game_pk","at_bat_number"], keep="first")
            [["game_pk","at_bat_number","swing_flag"]]
            .rename(columns={"swing_flag":"swung_first_pitch"})
)

# 3) Merge to ABs (prefer freshly computed flag)
ABs_flagged = ABs.merge(first_pitches, on=["game_pk","at_bat_number"], how="left", suffixes=("","_new"))
if "swung_first_pitch_new" in ABs_flagged.columns:
    ABs_flagged["swung_first_pitch"] = ABs_flagged["swung_first_pitch_new"].fillna(ABs_flagged.get("swung_first_pitch"))
    ABs_flagged.drop(columns=["swung_first_pitch_new"], inplace=True)
ABs_flagged["swung_first_pitch"] = ABs_flagged["swung_first_pitch"].astype("boolean")

# 4) Restrict to Success/Fail (job opps only)
job_ABs = ABs_flagged.loc[ABs_flagged["job_result"].isin(["Success","Fail"])].copy()

# 5) Job% by first-pitch decision
counts = (
    job_ABs.groupby(["swung_first_pitch","job_result"], dropna=False)
           .size()
           .unstack("job_result", fill_value=0)
)
for col in ("Success","Fail"):
    if col not in counts.columns:
        counts[col] = 0
counts["attempts"] = counts["Success"] + counts["Fail"]
counts["job_pct"] = counts["Success"].div(counts["attempts"]).where(counts["attempts"] != 0)

print("\n=== Job% by first-pitch swing (Success/(Success+Fail)) ===")
print(counts)

# (Optional) keep the augmented ABs
ABs = ABs_flagged
=== Job% by first-pitch swing (Success/(Success+Fail)) ===
job_result         Fail  Success  attempts   job_pct
swung_first_pitch                                   
False              3516     4235      7751  0.546381
True               2580     3272      5852  0.559125

Build per-pitch k_zone (Strike/Ball by geometry) and swing_flag, with types normalized.

In [227]:
# --- Normalize types we need ---
combined["plate_x"] = pd.to_numeric(combined["plate_x"], errors="coerce")
combined["plate_z"] = pd.to_numeric(combined["plate_z"], errors="coerce")
combined["sz_bot"]  = pd.to_numeric(combined["sz_bot"],  errors="coerce")
combined["sz_top"]  = pd.to_numeric(combined["sz_top"],  errors="coerce")
combined["game_pk"] = pd.to_numeric(combined["game_pk"], errors="coerce").astype("Int64")
combined["at_bat_number"] = pd.to_numeric(combined["at_bat_number"], errors="coerce").astype("Int64")

# --- k_zone: Strike/Ball by geometry (apply ball radius both axes) ---
BALL_RADIUS_FT = 0.12  # ~1.45 inches
in_x = combined["plate_x"].abs() <= (0.83 + BALL_RADIUS_FT)
in_z = (combined["plate_z"] + BALL_RADIUS_FT >= combined["sz_bot"]) & \
       (combined["plate_z"] - BALL_RADIUS_FT <= combined["sz_top"])
combined["k_zone"] = np.where(in_x & in_z, "Strike", "Ball")

# --- swing_flag for each pitch (True = offered) ---
take_events = {"ball","called_strike","pitchout","blocked_ball","intent_ball","hit_by_pitch"}
combined["description"] = combined["description"].astype(str).str.lower()
combined["swing_flag"] = ~combined["description"].isin(take_events)

Build AB-level first-pitch swing/take + zone table and report Job% across seven specs

In [228]:
required = {"game_pk","at_bat_number","pitch_number","description","k_zone"}
missing = required - set(combined.columns)
if missing:
    raise KeyError(f"combined is missing required columns: {missing}")

# Detect the job outcome column name
cands_exact = [c for c in combined.columns if c.lower() in {"job_result","job_outcome","joblabel","job_label"}]
cands_fuzzy = [c for c in combined.columns if ("job" in c.lower() and "result" in c.lower())]
job_cols = cands_exact or cands_fuzzy
if not job_cols:
    raise KeyError("Couldn't find a job outcome column (e.g., 'job_result'). "
                   "Please ensure combined has a column with 'Success'/'Fail' per AB (on last pitch).")
JOB_COL = job_cols[0]

# ---- 1) Ensure swing_flag exists (True = offered, False = take) ----
if "swing_flag" not in combined.columns:
    take_events = {"ball","called_strike","pitchout","blocked_ball","intent_ball","hit_by_pitch"}
    combined["description"] = combined["description"].astype(str).str.lower()
    combined["swing_flag"] = ~combined["description"].isin(take_events)

# ---- 2) First-pitch flags per AB ----
first_pitch = (
    combined
    .sort_values(["game_pk","at_bat_number","pitch_number"])
    .drop_duplicates(["game_pk","at_bat_number"], keep="first")
    [["game_pk","at_bat_number","swing_flag","k_zone"]]
    .rename(columns={"swing_flag":"swung_first_pitch",
                     "k_zone":"first_pitch_zone"})
)

# ---- 3) AB-level job outcome from the LAST pitch of each AB ----
ab_outcome = (
    combined
    .sort_values(["game_pk","at_bat_number","pitch_number"])
    .drop_duplicates(["game_pk","at_bat_number"], keep="last")
    [["game_pk","at_bat_number", JOB_COL]]
    .rename(columns={JOB_COL: "job_result"})
)

# Keep only ABs that are job opportunities (Success/Fail)
ab_outcome = ab_outcome[ab_outcome["job_result"].isin(["Success","Fail"])]

# ---- 4) Build AB-level table with first-pitch info + outcome ----
ABs_ab = (
    first_pitch
    .merge(ab_outcome, on=["game_pk","at_bat_number"], how="inner")
    .copy()
)

# ---- 5) Helper to compute Job% ----
def agg_job(df: pd.DataFrame) -> dict:
    if df.empty:
        return {"Success": 0, "Fail": 0, "Attempts": 0, "Job%": np.nan}
    s = (df["job_result"] == "Success").sum()
    f = (df["job_result"] == "Fail").sum()
    att = s + f
    return {"Success": s, "Fail": f, "Attempts": att, "Job%": (s / att) if att > 0 else np.nan}

# ---- 6) Build all seven specs ----
rows = []
def add_row(label, mask):
    d = agg_job(ABs_ab[mask]) if mask is not None else agg_job(ABs_ab)
    rows.append({"Spec": label, **d})

# 1. No specifications (total)
add_row("1) Total", None)

# 2. Swing first pitch
add_row("2) Swing first pitch", ABs_ab["swung_first_pitch"] == True)

# 3. Take first pitch
add_row("3) Take first pitch", ABs_ab["swung_first_pitch"] == False)

# 4. Swing first pitch (ball in zone)
add_row("4) Swing 1st (zone=Strike)",
        (ABs_ab["swung_first_pitch"] == True) & (ABs_ab["first_pitch_zone"] == "Strike"))

# 5. Swing first pitch (ball out of zone)
add_row("5) Swing 1st (zone=Ball)",
        (ABs_ab["swung_first_pitch"] == True) & (ABs_ab["first_pitch_zone"] == "Ball"))

# 6. Take first pitch (ball in zone)
add_row("6) Take 1st (zone=Strike)",
        (ABs_ab["swung_first_pitch"] == False) & (ABs_ab["first_pitch_zone"] == "Strike"))

# 7. Take first pitch (ball out of zone)
add_row("7) Take 1st (zone=Ball)",
        (ABs_ab["swung_first_pitch"] == False) & (ABs_ab["first_pitch_zone"] == "Ball"))

results_df = pd.DataFrame(rows)
results_df["Job%"] = results_df["Job%"].map(lambda x: f"{x:.2%}" if pd.notna(x) else "NA")
results_df
Out[228]:
Spec Success Fail Attempts Job%
0 1) Total 7507 6096 13603 55.19%
1 2) Swing first pitch 3272 2580 5852 55.91%
2 3) Take first pitch 4235 3516 7751 54.64%
3 4) Swing 1st (zone=Strike) 2664 1954 4618 57.69%
4 5) Swing 1st (zone=Ball) 608 626 1234 49.27%
5 6) Take 1st (zone=Strike) 1789 1682 3471 51.54%
6 7) Take 1st (zone=Ball) 2446 1834 4280 57.15%

Compute OBP/SLG/OPS splits by first-pitch swing/take and zone

In [229]:
# 0) Ensure swing_flag exists (True = swing; False = take)
if "swing_flag" not in combined.columns:
    take_events = {"ball","called_strike","pitchout","blocked_ball","intent_ball","hit_by_pitch"}
    combined["description"] = combined["description"].astype(str).str.lower()
    combined["swing_flag"] = ~combined["description"].isin(take_events)

# 1) First-pitch flags (swing + zone) per AB
first_pitch = (
    combined.sort_values(["game_pk","at_bat_number","pitch_number"])
            .drop_duplicates(["game_pk","at_bat_number"], keep="first")
            [["game_pk","at_bat_number","swing_flag","k_zone"]]
            .rename(columns={"swing_flag":"swung_first_pitch",
                             "k_zone":"first_pitch_zone"})
)

# 2) Last-pitch outcome per AB (use `events`)
ab_last = (
    combined.sort_values(["game_pk","at_bat_number","pitch_number"])
            .drop_duplicates(["game_pk","at_bat_number"], keep="last")
            [["game_pk","at_bat_number","events"]]
)

AB = first_pitch.merge(ab_last, on=["game_pk","at_bat_number"], how="inner").copy()
ev = AB["events"].astype(str).str.lower()

# 3) Build components for OBP & SLG
HITS   = {"single","double","triple","home_run"}
TB_MAP = {"single":1,"double":2,"triple":3,"home_run":4}
BB_SET = {"walk","intent_walk"}
HBP_SET = {"hit_by_pitch"}
SF_SET  = {"sac_fly"}
# plate appearances that are NOT at-bats (include both CI spellings)
NON_AB = {"walk","intent_walk","hit_by_pitch","sac_bunt","sac_fly","catcher_interf","catcher_interference"}

AB["H"]   = ev.isin(HITS).astype(int)
AB["BB"]  = ev.isin(BB_SET).astype(int)
AB["HBP"] = ev.isin(HBP_SET).astype(int)
AB["SF"]  = ev.isin(SF_SET).astype(int)
AB["TB"]  = ev.map(TB_MAP).fillna(0).astype(int)
AB["AB"]  = (~ev.isin(NON_AB)).astype(int)

def obp_slg(df: pd.DataFrame) -> dict:
    H   = int(df["H"].sum())
    BB  = int(df["BB"].sum())
    HBP = int(df["HBP"].sum())
    SF  = int(df["SF"].sum())
    ABs = int(df["AB"].sum())
    TB  = int(df["TB"].sum())
    denom_obp = ABs + BB + HBP + SF
    obp = (H + BB + HBP) / denom_obp if denom_obp > 0 else np.nan
    slg = TB / ABs if ABs > 0 else np.nan
    return {"PA": len(df), "AB": ABs, "H": H, "BB": BB, "HBP": HBP, "SF": SF, "TB": TB, "OBP": obp, "SLG": slg}

# 4) Masks for your 7 specifications
M_ALL = pd.Series(True, index=AB.index)
M_SW  = AB["swung_first_pitch"] == True
M_TK  = AB["swung_first_pitch"] == False
M_IN  = AB["first_pitch_zone"] == "Strike"
M_OUT = AB["first_pitch_zone"] == "Ball"

specs = [
    ("1) Total",                    M_ALL),
    ("2) Swing first pitch",        M_SW),
    ("3) Take first pitch",         M_TK),
    ("4) Swing 1st (zone=Strike)",  M_SW & M_IN),
    ("5) Swing 1st (zone=Ball)",    M_SW & M_OUT),
    ("6) Take 1st (zone=Strike)",   M_TK & M_IN),
    ("7) Take 1st (zone=Ball)",     M_TK & M_OUT),
]

rows = []
for label, mask in specs:
    rows.append({"Spec": label, **obp_slg(AB[mask])})

obp_slg_df = pd.DataFrame(rows)
obp_slg_df["OBP"] = obp_slg_df["OBP"].round(3)
obp_slg_df["SLG"] = obp_slg_df["SLG"].round(3)
obp_slg_df["OPS"] = (obp_slg_df["OBP"] + obp_slg_df["SLG"]).round(3)

# Tidy columns
obp_slg_df = obp_slg_df[["Spec","PA","OBP","SLG","OPS"]]
obp_slg_df
Out[229]:
Spec PA OBP SLG OPS
0 1) Total 15293 0.339 0.470 0.809
1 2) Swing first pitch 6329 0.306 0.488 0.794
2 3) Take first pitch 8964 0.361 0.456 0.817
3 4) Swing 1st (zone=Strike) 4997 0.312 0.515 0.827
4 5) Swing 1st (zone=Ball) 1332 0.284 0.393 0.677
5 6) Take 1st (zone=Strike) 3833 0.298 0.406 0.704
6 7) Take 1st (zone=Ball) 5131 0.408 0.500 0.908

Build AB-level table with first-pitch data for each team

In [230]:
# --- Build AB-level table with batting_team from the FIRST pitch of each AB ---
first_pitch = (
    combined.sort_values(["game_pk","at_bat_number","pitch_number"])
            .drop_duplicates(["game_pk","at_bat_number"], keep="first")
            [["game_pk","at_bat_number","swing_flag","k_zone","home_team","away_team","inning_topbot"]]
            .rename(columns={"swing_flag":"swung_first_pitch",
                             "k_zone":"first_pitch_zone"})
)

# Derive batting team: Top = away bats, Bottom = home bats
top = first_pitch["inning_topbot"].astype(str).str.lower().str.startswith("top")
first_pitch["batting_team"] = np.where(top, first_pitch["away_team"], first_pitch["home_team"])

# Last pitch from pitch-level 'combined' (events only)
last_pitch = (
    combined.sort_values(["game_pk","at_bat_number","pitch_number"])
            .drop_duplicates(["game_pk","at_bat_number"], keep="last")
            [["game_pk","at_bat_number","events"]]
)

# Merge first/last pitch into AB table
AB = first_pitch.merge(last_pitch, on=["game_pk","at_bat_number"], how="inner").copy()

# Attach PA-level job_result from ABs (PA table). Do NOT overwrite job_result elsewhere.
AB = AB.merge(
    ABs[["game_pk","at_bat_number","job_result"]],
    on=["game_pk","at_bat_number"],
    how="left"
)

AB["swung_first_pitch"] = AB["swung_first_pitch"].astype(bool)
AB["first_pitch_zone"]  = AB["first_pitch_zone"].astype(str)

# --- Prepare hitting components (H, TB, AB flag) from events ---
HITS   = {"single","double","triple","home_run"}
TB_MAP = {"single":1,"double":2,"triple":3,"home_run":4}
BB_SET = {"walk","intent_walk"}
HBP_SET= {"hit_by_pitch"}
SF_SET = {"sac_fly"}

# Correct Statcast token for catcher interference
NON_AB = {"walk","intent_walk","hit_by_pitch","sac_bunt","sac_fly","catcher_interference"}

ev = AB["events"].astype(str).str.lower()
AB["H"]   = ev.isin(HITS).astype(int)
AB["TB"]  = ev.map(TB_MAP).fillna(0).astype(int)
AB["BB"]  = ev.isin(BB_SET).astype(int)
AB["HBP"] = ev.isin(HBP_SET).astype(int)
AB["SF"]  = ev.isin(SF_SET).astype(int)
AB["AB"]  = (~ev.isin(NON_AB)).astype(int)

# --- Spec masks (functions so they can be applied per-group) ---
def spec_masks(df):
    M_SW  = df["swung_first_pitch"] == True
    M_TK  = df["swung_first_pitch"] == False
    M_IN  = df["first_pitch_zone"] == "Strike"
    M_OUT = df["first_pitch_zone"] == "Ball"
    return [
        ("1) Total",                     pd.Series(True, index=df.index)),
        ("2) Swing first pitch",         M_SW),
        ("3) Take first pitch",          M_TK),
        ("4) Swing 1st (zone=Strike)",   M_SW & M_IN),
        ("5) Swing 1st (zone=Ball)",     M_SW & M_OUT),
        ("6) Take 1st (zone=Strike)",    M_TK & M_IN),
        ("7) Take 1st (zone=Ball)",      M_TK & M_OUT),
    ]

# --- Per-team 7-spec table with PA, AB, H, OBP, SLG, OPS ---
def team_hitting_counts_table(AB: pd.DataFrame) -> pd.DataFrame:
    rows = []
    for team, g in AB.groupby("batting_team", sort=True):
        for label, m in spec_masks(g):
            sub = g[m]
            PA  = len(sub)
            ABs = int(sub["AB"].sum())
            H   = int(sub["H"].sum())
            TB  = int(sub["TB"].sum())
            BB  = int(sub["BB"].sum())
            HBP = int(sub["HBP"].sum())
            SF  = int(sub["SF"].sum())

            denom_obp = ABs + BB + HBP + SF
            obp = (H + BB + HBP) / denom_obp if denom_obp > 0 else np.nan
            slg = (TB / ABs) if ABs > 0 else np.nan
            ops = (obp + slg) if (pd.notna(obp) and pd.notna(slg)) else np.nan

            rows.append({
                "team": team,
                "Spec": label,
                "PA": PA,
                "AB": ABs,
                "H": H,
                "OBP": round(obp, 3) if pd.notna(obp) else np.nan,
                "SLG": round(slg, 3) if pd.notna(slg) else np.nan,
                "OPS": round(ops, 3) if pd.notna(ops) else np.nan,
            })
    out = pd.DataFrame(rows)
    # Sort by team then natural Spec order (1..7) using the leading digit
    out["spec_order"] = out["Spec"].str.extract(r"^(\d+)").astype(int)
    out = out.sort_values(["team","spec_order"]).drop(columns="spec_order").reset_index(drop=True)
    return out

team_hitting_counts_df = team_hitting_counts_table(AB)
team_hitting_counts_df.head(10)
Out[230]:
team Spec PA AB H OBP SLG OPS
0 ATH 1) Total 475 381 107 0.335 0.491 0.826
1 ATH 2) Swing first pitch 214 186 53 0.283 0.484 0.767
2 ATH 3) Take first pitch 261 195 54 0.377 0.497 0.874
3 ATH 4) Swing 1st (zone=Strike) 163 143 40 0.269 0.510 0.780
4 ATH 5) Swing 1st (zone=Ball) 51 43 13 0.327 0.395 0.722
5 ATH 6) Take 1st (zone=Strike) 93 76 20 0.301 0.474 0.775
6 ATH 7) Take 1st (zone=Ball) 168 119 34 0.419 0.513 0.932
7 ATL 1) Total 534 437 114 0.319 0.400 0.720
8 ATL 2) Swing first pitch 201 169 39 0.242 0.367 0.609
9 ATL 3) Take first pitch 333 268 75 0.364 0.422 0.786

Build AB table for Job% splits by team

In [231]:
# --- First pitch (PA-level keys from pitch-level) ---
first_pitch = (
    combined.sort_values(["game_pk","at_bat_number","pitch_number"])
            .drop_duplicates(["game_pk","at_bat_number"], keep="first")
            [["game_pk","at_bat_number","swing_flag","k_zone","home_team","away_team","inning_topbot"]]
            .rename(columns={"swing_flag":"swung_first_pitch",
                             "k_zone":"first_pitch_zone"})
)

# Batting team: Top = away, Bottom = home
top = first_pitch["inning_topbot"].astype(str).str.lower().str.startswith("top")
first_pitch["batting_team"] = np.where(top, first_pitch["away_team"], first_pitch["home_team"])

# --- Merge PA-level job_result from ABs (authoritative) ---
AB = first_pitch.merge(
    ABs[["game_pk","at_bat_number","job_result"]],
    on=["game_pk","at_bat_number"],
    how="left"
).copy()

# Types for masks
AB["swung_first_pitch"] = AB["swung_first_pitch"].astype(bool)
AB["first_pitch_zone"]  = AB["first_pitch_zone"].astype(str)

# --- Spec masks ---
def spec_masks(df):
    M_SW  = df["swung_first_pitch"] == True
    M_TK  = df["swung_first_pitch"] == False
    M_IN  = df["first_pitch_zone"] == "Strike"
    M_OUT = df["first_pitch_zone"] == "Ball"
    return [
        ("1) Total",                     pd.Series(True, index=df.index)),
        ("2) Swing first pitch",         M_SW),
        ("3) Take first pitch",          M_TK),
        ("4) Swing 1st (zone=Strike)",   M_SW & M_IN),
        ("5) Swing 1st (zone=Ball)",     M_SW & M_OUT),
        ("6) Take 1st (zone=Strike)",    M_TK & M_IN),
        ("7) Take 1st (zone=Ball)",      M_TK & M_OUT),
    ]

# --- Per-team Job% table ---
def team_job_table(AB: pd.DataFrame) -> pd.DataFrame:
    rows = []
    for team, g in AB.groupby("batting_team", sort=True):
        for label, mask in spec_masks(g):
            sub = g[mask]
            s = (sub["job_result"]=="Success").sum()
            f = (sub["job_result"]=="Fail").sum()
            attempts = int(s + f)
            pct = (s / attempts) if attempts > 0 else np.nan
            rows.append({
                "team": team,
                "Spec": label,
                "Success": int(s),
                "Fail": int(f),
                "Attempts": attempts,
                "Job%": f"{pct:.2%}" if attempts > 0 else "NA"
            })
    out = pd.DataFrame(rows)
    out["spec_order"] = out["Spec"].str.extract(r"^(\d+)").astype(int)
    out = out.sort_values(["team","spec_order"]).drop(columns="spec_order").reset_index(drop=True)
    return out

team_job_df = team_job_table(AB)
team_job_df.head(12)
Out[231]:
team Spec Success Fail Attempts Job%
0 ATH 1) Total 218 189 407 53.56%
1 ATH 2) Swing first pitch 110 85 195 56.41%
2 ATH 3) Take first pitch 108 104 212 50.94%
3 ATH 4) Swing 1st (zone=Strike) 84 64 148 56.76%
4 ATH 5) Swing 1st (zone=Ball) 26 21 47 55.32%
5 ATH 6) Take 1st (zone=Strike) 42 41 83 50.60%
6 ATH 7) Take 1st (zone=Ball) 66 63 129 51.16%
7 ATL 1) Total 257 225 482 53.32%
8 ATL 2) Swing first pitch 102 87 189 53.97%
9 ATL 3) Take first pitch 155 138 293 52.90%
10 ATL 4) Swing 1st (zone=Strike) 86 64 150 57.33%
11 ATL 5) Swing 1st (zone=Ball) 16 23 39 41.03%

Backfill first-pitch swing_flag correctly

In [232]:
if "swing_flag" not in combined.columns:
    swing_like = {
        "swinging_strike", "swinging_strike_blocked",
        "foul", "foul_tip", "foul_bunt", "bunt_foul_tip",
        "missed_bunt",
        "hit_into_play", "hit_into_play_no_out", "hit_into_play_score"
    }
    desc = combined["description"].astype(str).str.lower()
    combined["swing_flag"] = desc.isin(swing_like).astype(bool)

Get Job% when first pitch is swung at per zone(k zone strike)

In [233]:
try:
    df = combined.copy()
except NameError:
    df = pd.read_csv("/mnt/data/comb.csv")

# ==========================================================
# Config (match Code 1 exactly)
# ==========================================================
LEFT_EDGE, RIGHT_EDGE = -0.83, 0.83
V_BUF = 0.12
WIDTH = RIGHT_EDGE - LEFT_EDGE
THIRD = WIDTH / 3.0
H_BINS   = [LEFT_EDGE, LEFT_EDGE+THIRD, LEFT_EDGE+2*THIRD, RIGHT_EDGE]
H_LABELS = ["Left","Middle","Right"]
V_LABELS = ["High","Mid","Low"]
GCOLS = ["game_pk","at_bat_number"]

# Swing/take logic EXACTLY like Code 1 (no automatic_* here)
TAKE_EVENTS = {"ball","called_strike","pitchout","blocked_ball","intent_ball","hit_by_pitch"}

# ==========================================================
# 1) First pitch (include geometry) + last-pitch job outcome
# ==========================================================
df = df.sort_values(GCOLS + ["pitch_number"])

# Ensure swing_flag exists with Code-1 rules
if "swing_flag" not in df.columns:
    desc = df["description"].astype(str).str.lower()
    df["swing_flag"] = ~desc.isin(TAKE_EVENTS)

# Require k_zone to come from the dataset (like Code 1)
if "k_zone" not in df.columns:
    raise KeyError("combined must include k_zone to match Code 1.")

# First-pitch features INCLUDING geometry (so we can 3×3 later)
first = (
    df.drop_duplicates(GCOLS, keep="first")
      [["game_pk","at_bat_number","swing_flag","k_zone","plate_x","plate_z","sz_top","sz_bot"]]
      .rename(columns={"swing_flag":"swung_first_pitch",
                       "k_zone":"first_pitch_zone"})
      .copy()
)

# Last-pitch Job outcome (normalize; keep all, then filter to attempts)
# Detect job_result column like Code 1
cands_exact = [c for c in df.columns if c.lower() in {"job_result","job_outcome","joblabel","job_label"}]
cands_fuzzy = [c for c in df.columns if ("job" in c.lower() and "result" in c.lower())]
JOB_COL = (cands_exact or cands_fuzzy)[0]

last = (
    df.drop_duplicates(GCOLS, keep="last")
      [GCOLS + [JOB_COL]]
      .rename(columns={JOB_COL:"job_result"})
      .copy()
)
last["job_result"] = (last["job_result"].astype(str)
                      .str.strip().str.title()
                      .replace({"Failure":"Fail","Failed":"Fail"}))

# AB-level table like Code 1
ABs_ab = first.merge(last, on=GCOLS, how="inner")

# Filter to Job attempts (Success/Fail) — EXACTLY Code 1 behavior
ABs_ab = ABs_ab[ABs_ab["job_result"].isin(["Success","Fail"])].copy()

# ==========================================================
# 2) Build the EXACT Code-1 spec: Swing 1st (zone=Strike)
# ==========================================================
spec_mask = (ABs_ab["swung_first_pitch"].astype(bool)) & (ABs_ab["first_pitch_zone"] == "Strike")
base = ABs_ab.loc[spec_mask].copy()

# Overall Job% for this spec (matches Code 1)
s_all = (base["job_result"] == "Success").sum()
f_all = (base["job_result"] == "Fail").sum()
att_all = s_all + f_all
job_overall = s_all / att_all if att_all > 0 else np.nan

# ==========================================================
# 3) 3×3 MLB grid (placeable rows only) + reconciliation
# ==========================================================
# Keep only rows with full geometry (these are the ones your grid can place)
geom_mask = base[["plate_x","plate_z","sz_top","sz_bot"]].notna().all(axis=1)
placed = base.loc[geom_mask].copy()
dropped = att_all - len(placed)

# Horizontal thirds within [-0.83, +0.83]
placed["Hbin"] = pd.cut(
    placed["plate_x"].clip(LEFT_EDGE, RIGHT_EDGE),
    bins=H_BINS, labels=H_LABELS, include_lowest=True
)

# Vertical thirds by per-pitch normalization
span = (placed["sz_top"] - placed["sz_bot"]).replace(0, np.nan)
z = ((placed["plate_z"] - placed["sz_bot"]) / span).clip(0, 1)
placed["Vbin"] = pd.cut(z, bins=[-np.inf, 1/3, 2/3, np.inf], labels=["Low","Mid","High"])
placed["Vbin"] = pd.Categorical(placed["Vbin"], categories=["High","Mid","Low"], ordered=True)

placed = placed.dropna(subset=["Hbin","Vbin"])

# Crosstabs (Success/Fail counts are already implied by job_result)
is_success = placed["job_result"].eq("Success")
job_num  = pd.crosstab(placed.loc[is_success, "Vbin"], placed.loc[is_success, "Hbin"])
attempts = pd.crosstab(placed["Vbin"], placed["Hbin"])

job_num  = job_num.reindex(index=V_LABELS, columns=H_LABELS, fill_value=0)
attempts = attempts.reindex(index=V_LABELS, columns=H_LABELS, fill_value=0)

job_pct_9 = (job_num / attempts).replace([np.nan, np.inf], 0.0)
weighted_job_placed = job_num.values.sum() / attempts.values.sum() if attempts.values.sum() else np.nan

# ==========================================================
# 4) Print reconciliation
# ==========================================================
print("=== SPEC: 4) Swing 1st (zone=Strike) — EXACT Code 1 Cohort ===")
print(f"Total attempts in spec (Success+Fail): {att_all}")
print(f"Placed in 3×3 (have full geometry):   {len(placed)}")
print(f"Dropped for missing geometry:          {dropped}\n")

print("=== Job% overall (Code 1 cohort, no geometry filter) ===")
print(f"{job_overall:.3f}\n")

print("=== Job% by 3×3 (placed-only) ===")
print(job_pct_9.round(3))
print("\n=== Attempts (3×3 placed-only) ===")
print(attempts)

print(f"\nWeighted Job% across 3×3 (placed-only): {weighted_job_placed:.3f}")

if not np.isnan(job_overall) and not np.isnan(weighted_job_placed):
    diff = weighted_job_placed - job_overall
    print(f"Difference vs overall spec: {diff:+.3f}  (caused by excluded geometry rows and/or zone-edge clipping)")
=== SPEC: 4) Swing 1st (zone=Strike) — EXACT Code 1 Cohort ===
Total attempts in spec (Success+Fail): 4618
Placed in 3×3 (have full geometry):   4618
Dropped for missing geometry:          0

=== Job% overall (Code 1 cohort, no geometry filter) ===
0.577

=== Job% by 3×3 (placed-only) ===
Hbin   Left  Middle  Right
Vbin                      
High  0.580   0.560  0.568
Mid   0.588   0.605  0.616
Low   0.559   0.575  0.519

=== Attempts (3×3 placed-only) ===
Hbin  Left  Middle  Right
Vbin                     
High   412     530    370
Mid    570     681    552
Low    422     609    472

Weighted Job% across 3×3 (placed-only): 0.577
Difference vs overall spec: +0.000  (caused by excluded geometry rows and/or zone-edge clipping)

Compare K zone first pitch swings to overall job%

In [234]:
# --- Data ---
BASELINE = 0.552
vals = np.array([
    [0.586, 0.560, 0.568],
    [0.581, 0.605, 0.617],
    [0.572, 0.575, 0.529],
])
rows = ["High", "Mid", "Low"]
cols = ["Left", "Middle", "Right"]

# --- Centered colormap around baseline ---
diff = max(vals.max() - BASELINE, BASELINE - vals.min())
norm = TwoSlopeNorm(vmin=BASELINE - diff, vcenter=BASELINE, vmax=BASELINE + diff)

# --- Larger figure for visibility in Jupyter ---
fig, ax = plt.subplots(figsize=(6.5, 6.5), dpi=150)

# --- Heatmap ---
im = ax.imshow(vals, cmap="RdBu_r", norm=norm, origin="upper")

# --- Grid lines ---
for k in range(4):
    ax.axhline(k - 0.5, color="black", lw=1)
    ax.axvline(k - 0.5, color="black", lw=1)

# --- Value labels (white text + black outline for max visibility) ---
for r in range(vals.shape[0]):
    for c in range(vals.shape[1]):
        ax.text(c, r, f"{vals[r,c]:.3f}",
                ha="center", va="center",
                fontsize=11, fontweight="bold",
                color="white",
                path_effects=[pe.withStroke(linewidth=1.5, foreground='black')])

# --- Axis labels ---
ax.set_xticks(range(3), cols)
ax.set_yticks(range(3), rows)

# --- Title with padding ---
ax.set_title(f"Job% on 0–0 Swings at First-Pitch Strikes\n3×3 (baseline = {BASELINE:.3f})",
             pad=15)

# --- Axis limits ---
ax.set_xlim(-0.5, 2.5)
ax.set_ylim(2.5, -0.5)

# --- Colorbar ---
cb = plt.colorbar(im, ax=ax, fraction=0.046, pad=0.06)
cb.set_label("Job%")

# --- Tight layout to avoid clipping ---
plt.tight_layout()
plt.show()
No description has been provided for this image

read in two strikes csv

In [235]:
two_strikes = pd.read_csv("2strikes.csv")

Get swing percentage on pitches in and out of the zone for different 2 strike counts

In [236]:
def ensure_swing_flag(df: pd.DataFrame) -> pd.DataFrame:
    if "swing_flag" in df.columns:
        df["swing_flag"] = df["swing_flag"].astype(bool)
        return df
    take_events = {"ball","called_strike","pitchout","blocked_ball","intent_ball","hit_by_pitch"}
    desc = df["description"].astype(str).str.lower()
    df["swing_flag"] = ~desc.isin(take_events)
    return df

def ensure_k_zone(df: pd.DataFrame) -> pd.DataFrame:
    if "k_zone" in df.columns:
        return df
    BR = 0.12
    for c in ["plate_x","plate_z","sz_bot","sz_top"]:
        if c in df.columns:
            df[c] = pd.to_numeric(df[c], errors="coerce")
    in_x = df["plate_x"].abs() <= 0.83
    in_z = (df["plate_z"] + BR >= df["sz_bot"]) & (df["plate_z"] - BR <= df["sz_top"])
    df["k_zone"] = np.where(in_x & in_z, "Strike", "Ball")
    return df

def add_count_cols(df: pd.DataFrame) -> pd.DataFrame:
    cl = {c.lower(): c for c in df.columns}
    if "balls" in cl and "strikes" in cl:
        b = pd.to_numeric(df[cl["balls"]], errors="coerce").astype("Int64")
        s = pd.to_numeric(df[cl["strikes"]], errors="coerce").astype("Int64")
    elif "count" in cl:
        cnt = df[cl["count"]].astype(str)
        m = cnt.str.extract(r"(?P<b>\d)\s*[-–]\s*(?P<s>\d)")
        b = pd.to_numeric(m["b"], errors="coerce").astype("Int64")
        s = pda.to_numeric(m["s"], errors="coerce").astype("Int64")
    else:
        raise KeyError("Need 'balls' & 'strikes' or a 'count' column (e.g., '3-2').")
    df = df.assign(balls=b, strikes=s)
    df["count_str"] = df["balls"].astype(str) + "-" + df["strikes"].astype(str)
    return df

def swing_pivot(df: pd.DataFrame, by=None) -> pd.DataFrame:
    if by is None:
        df = df.assign(__grp="All")
        by = ["__grp"]
    g = (df.groupby(by + ["k_zone"])["swing_flag"]
           .agg(swung="sum", total="count")
           .reset_index())
    out = g.pivot(index=by, columns="k_zone", values=["swung","total"]).fillna(0).astype(int)
    for z in ["Strike","Ball"]:
        if ("swung", z) not in out.columns: out[("swung", z)] = 0
        if ("total", z) not in out.columns: out[("total", z)] = 0
    out[("pct","Strike")] = np.where(out[("total","Strike")] > 0, out[("swung","Strike")] / out[("total","Strike")], np.nan)
    out[("pct","Ball")]   = np.where(out[("total","Ball")]   > 0, out[("swung","Ball")]   / out[("total","Ball")],   np.nan)
    return out

def baseball_order(idx):
    bs = []
    for c in idx:
        try:
            b,s = c.split("-"); b,s = int(b), int(s)
        except Exception:
            b,s = 9,9
        bs.append((b,s,c))
    # keep only two-strike counts
    bs = [t for t in bs if 0 <= t[0] <= 3 and t[1] == 2]
    return [t[2] for t in sorted(bs, key=lambda x: (x[0], x[1]))]

# --- compute on two_strikes ---
df = two_strikes.copy()
df = ensure_swing_flag(df)
df = ensure_k_zone(df)
df = add_count_cols(df)
df = df.dropna(subset=["k_zone","swing_flag","balls","strikes"])

# (1) Overall two-strike swing% by zone
overall = swing_pivot(df, by=None)

# (2) By specific two-strike counts (0-2, 1-2, 2-2)
by_count = swing_pivot(df, by=["count_str"])
order = baseball_order(by_count.index)
by_count = by_count.loc[order]

# --- print results ---
# Overall
s_sw = overall.iloc[0][("swung","Strike")]; s_to = overall.iloc[0][("total","Strike")]; s_pc = overall.iloc[0][("pct","Strike")]
b_sw = overall.iloc[0][("swung","Ball")];   b_to = overall.iloc[0][("total","Ball")];   b_pc = overall.iloc[0][("pct","Ball")]
print("=== Two-Strike Overall — counts & swing% ===")
print(f"Strike {int(s_sw)}/{int(s_to)} ({'NA' if pd.isna(s_pc) else f'{s_pc*100:.1f}%'}), "
      f"Ball {int(b_sw)}/{int(b_to)} ({'NA' if pd.isna(b_pc) else f'{b_pc*100:.1f}%'})")

# By count
print("\n=== By Count (0-2, 1-2, 2-2) — counts & swing% ===")
for c in by_count.index:
    s_sw = by_count.loc[c, ("swung","Strike")]; s_to = by_count.loc[c, ("total","Strike")]; s_pc = by_count.loc[c, ("pct","Strike")]
    b_sw = by_count.loc[c, ("swung","Ball")];   b_to = by_count.loc[c, ("total","Ball")];   b_pc = by_count.loc[c, ("pct","Ball")]
    s_pct_str = "NA" if pd.isna(s_pc) else f"{s_pc*100:.1f}%"
    b_pct_str = "NA" if pd.isna(b_pc) else f"{b_pc*100:.1f}%"
    print(f"{c}:  Strike {int(s_sw)}/{int(s_to)} ({s_pct_str}) | Ball {int(b_sw)}/{int(b_to)} ({b_pct_str})")
=== Two-Strike Overall — counts & swing% ===
Strike 9609/10865 (88.4%), Ball 5279/14135 (37.3%)

=== By Count (0-2, 1-2, 2-2) — counts & swing% ===
0-2:  Strike 1591/1830 (86.9%) | Ball 1190/3791 (31.4%)
1-2:  Strike 2878/3239 (88.9%) | Ball 1854/5004 (37.1%)
2-2:  Strike 2944/3308 (89.0%) | Ball 1525/3672 (41.5%)
3-2:  Strike 2196/2488 (88.3%) | Ball 710/1668 (42.6%)

Graph first pitch and 2 strike swing and take metrics

In [237]:
def ensure_swing_flag(df: pd.DataFrame) -> pd.DataFrame:
    if "swing_flag" in df.columns:
        df["swing_flag"] = df["swing_flag"].astype(bool)
        return df
    take_events = {"ball","called_strike","pitchout","blocked_ball","intent_ball","hit_by_pitch"}
    desc = df["description"].astype(str).str.lower()
    df["swing_flag"] = ~desc.isin(take_events)
    return df

def ensure_k_zone(df: pd.DataFrame) -> pd.DataFrame:
    if "k_zone" in df.columns:
        return df
    BR = 0.12
    for c in ["plate_x","plate_z","sz_bot","sz_top"]:
        df[c] = pd.to_numeric(df[c], errors="coerce")
    in_x = df["plate_x"].abs() <= 0.83
    in_z = (df["plate_z"] + BR >= df["sz_bot"]) & (df["plate_z"] - BR <= df["sz_top"])
    df["k_zone"] = np.where(in_x & in_z, "Strike", "Ball")
    return df

def swing_table(df: pd.DataFrame) -> pd.DataFrame:
    g = (df.groupby("k_zone")["swing_flag"]
           .agg(swung="sum", total="count")
           .reindex(["Strike","Ball"])
           .fillna(0))
    g["swung"] = g["swung"].astype(int)
    g["total"] = g["total"].astype(int)
    g["pct"] = np.where(g["total"] > 0, g["swung"] / g["total"], np.nan)
    return g

# --- A) First pitch of job ABs (combined) ---
try:
    combined
except NameError:
    combined = pd.read_csv("/mnt/data/comb.csv")

dfc = combined.copy()
dfc = ensure_swing_flag(dfc)
dfc = ensure_k_zone(dfc)
dfc_fp = (dfc.sort_values(["game_pk","at_bat_number","pitch_number"])
            .drop_duplicates(["game_pk","at_bat_number"], keep="first")
            .dropna(subset=["k_zone","swing_flag"]))
tab_first = swing_table(dfc_fp)

# --- B) Two-strike overall (two_strikes) ---
try:
    two_strikes
except NameError:
    two_strikes = pd.read_csv("/mnt/data/two_strikes.csv")

ts = two_strikes.copy()
ts = ensure_swing_flag(ts)
ts = ensure_k_zone(ts)
ts = ts.dropna(subset=["k_zone","swing_flag"])
tab_2s = swing_table(ts)

# --- prep data for plot ---
zones = ["Strike","Ball"]

first_rates = tab_first.loc[zones, "pct"].to_numpy()
two_rates   = tab_2s.loc[zones, "pct"].to_numpy()

first_swung = tab_first.loc[zones, "swung"].to_numpy()
first_total = tab_first.loc[zones, "total"].to_numpy()
two_swung   = tab_2s.loc[zones, "swung"].to_numpy()
two_total   = tab_2s.loc[zones, "total"].to_numpy()

# --- plot (two series only) ---
x = np.arange(len(zones))
width = 0.35

fig, ax = plt.subplots(figsize=(7, 4.5), dpi=150)
b1 = ax.bar(x - width/2, first_rates, width, label="First Pitch (Job Opp)")
b2 = ax.bar(x + width/2, two_rates,   width, label="Two Strikes (Overall)")

ax.set_title("Swing% by Zone: 0–0(Job) vs Two Strikes")
ax.set_ylabel("Swing rate")
ax.set_xticks(x, zones)
ax.set_ylim(0, 1.10)  # a bit more headroom for labels
ax.legend()

# annotate each bar with "xx.x% (swung/total)"
series = [
    (b1, first_rates, first_swung, first_total),
    (b2, two_rates,   two_swung,   two_total),
]

for bars, rates, swung_arr, total_arr in series:
    for j, b in enumerate(bars):
        h = rates[j]
        s = int(swung_arr[j]) if j < len(swung_arr) else 0
        t = int(total_arr[j]) if j < len(total_arr) else 0
        if np.isfinite(h) and t > 0:
            label = f"{h*100:.1f}% ({s}/{t})"
        else:
            label = "N/A"
        ax.text(b.get_x() + b.get_width()/2, (h if np.isfinite(h) else 0) + 0.03,
                label, ha="center", va="bottom", fontsize=9)

plt.tight_layout()
plt.show()
No description has been provided for this image

Run simulation of job percentage change and season wins added given 2 strike approach on first pitch of job opportuntity at bats

In [238]:
# Team-specific hypothetical: reallocate 0–0 decisions using each TEAM's own payoffs
# Output: 30 rows (one per team) with:
#  team, Attempts, Success (actual), Success_new (hypothetical), JobPct, JobPct_new, Delta_success
import numpy as np
import pandas as pd
import re

# ------- config -------
P_STRIKE_SWING = 0.884   # swing% on 0–0 strikes (from your two-strike study)
P_BALL_SWING   = 0.373   # swing% on 0–0 balls

SPEC_TOTAL       = "1) Total"
SPEC_STRIKE_SW   = "4) Swing 1st (zone=Strike)"
SPEC_BALL_SW     = "5) Swing 1st (zone=Ball)"
SPEC_STRIKE_TK   = "6) Take 1st (zone=Strike)"
SPEC_BALL_TK     = "7) Take 1st (zone=Ball)"
FOUR_SPECS = [SPEC_STRIKE_SW, SPEC_BALL_SW, SPEC_STRIKE_TK, SPEC_BALL_TK]

def _clean_cols(df):
    df = df.copy()
    df.columns = [re.sub(r"\s+", " ", str(c).strip()) for c in df.columns]
    return df

def _pick(df, names):
    lower = {c.lower(): c for c in df.columns}
    for n in names:
        if n.lower() in lower:
            return lower[n.lower()]
    return None

# ------- attach input -------
try:
    team_job_df
except NameError:
    team_job_df = pd.read_csv("/mnt/data/team_job_percent.csv")

t = _clean_cols(team_job_df)
c_team = _pick(t, ["team","Team","batting_team","club","org"])
c_spec = _pick(t, ["Spec","spec","Bucket","Label"])
c_succ = _pick(t, ["Success","succ"])
c_fail = _pick(t, ["Fail","failure","fails"])
c_att  = _pick(t, ["Attempts","attempts","Att"])
if not all([c_team, c_spec, c_succ, c_fail, c_att]):
    raise KeyError("team_job_df must have columns: team, Spec, Success, Fail, Attempts.")

t = t.rename(columns={c_team:"team", c_spec:"Spec", c_succ:"Success", c_fail:"Fail", c_att:"Attempts"})
t[["Success","Fail","Attempts"]] = t[["Success","Fail","Attempts"]].apply(pd.to_numeric, errors="coerce").fillna(0).astype(int)

# ------- league fallbacks per spec (in case a team has 0 attempts in a bucket) -------
league_totals = (t[t["Spec"].isin(FOUR_SPECS)]
                 .groupby("Spec")[["Success","Attempts"]].sum())
league_rates = (league_totals["Success"] / league_totals["Attempts"]).replace([np.inf, -np.inf], np.nan)

# ------- team-specific rates & attempts in the 4 buckets -------
four = t[t["Spec"].isin(FOUR_SPECS)].copy()
four["JobRate"] = np.where(four["Attempts"] > 0, four["Success"] / four["Attempts"], np.nan)

# attempts wide
A = four.pivot_table(index="team", columns="Spec", values="Attempts", aggfunc="sum").reindex(columns=FOUR_SPECS, fill_value=0)
# rates wide (fill missing with league)
R = four.pivot_table(index="team", columns="Spec", values="JobRate", aggfunc="mean").reindex(columns=FOUR_SPECS)
for spec in FOUR_SPECS:
    R[spec] = R[spec].fillna(league_rates.get(spec, np.nan))

# ensure teams with missing specs still appear
teams = sorted(t["team"].unique())
A = A.reindex(teams).fillna(0).astype(int)
R = R.reindex(teams)

# ------- current totals per team (from "1) Total") -------
totals = (t[t["Spec"].eq(SPEC_TOTAL)][["team","Success","Fail","Attempts"]]
          .drop_duplicates()
          .set_index("team"))
# fallback if "1) Total" missing: sum across all specs for that team
missing = set(teams) - set(totals.index)
if missing:
    fb = (t.groupby("team")[["Success","Fail","Attempts"]].sum())
    totals = pd.concat([totals, fb.loc[list(missing)]], axis=0)

# ------- reallocate decisions under new approach (attempts stay the same) -------
N_strike = (A[SPEC_STRIKE_SW] + A[SPEC_STRIKE_TK]).astype(int)
N_ball   = (A[SPEC_BALL_SW]   + A[SPEC_BALL_TK]).astype(int)

S_strike_new = np.rint(N_strike * P_STRIKE_SWING).astype(int)
T_strike_new = N_strike - S_strike_new
S_ball_new   = np.rint(N_ball   * P_BALL_SWING).astype(int)
T_ball_new   = N_ball   - S_ball_new

# ------- expected successes using TEAM-SPECIFIC payoffs -------
succ_new = (
    S_strike_new * R[SPEC_STRIKE_SW] +
    T_strike_new * R[SPEC_STRIKE_TK] +
    S_ball_new   * R[SPEC_BALL_SW]   +
    T_ball_new   * R[SPEC_BALL_TK]
).round().astype(int)  # round to nearest whole success

# ------- assemble comparison table -------
out = pd.DataFrame({
    "Team": teams,
    "Attempts": totals.loc[teams, "Attempts"].values,
    "Success":  totals.loc[teams, "Success"].values,
    "Success_new": succ_new.reindex(teams).fillna(0).astype(int).values
})
out["JobPct"]     = np.where(out["Attempts"]>0, out["Success"]     / out["Attempts"], np.nan)
out["JobPct_new"] = np.where(out["Attempts"]>0, out["Success_new"] / out["Attempts"], np.nan)
out["Delta_success"] = out["Success_new"] - out["Success"]

# order & (optional) pretty %
out = out.sort_values("Delta_success", ascending=False).reset_index(drop=True)

if 'out' in globals():
    df = out.copy()
elif 'team_job_compare' in globals():
    df = team_job_compare.copy()
else:
    raise NameError("Couldn't find the comparison DataFrame (`out` or `team_job_compare`).")

# ensure JobPct columns exist (as proportions 0..1)
if "JobPct" not in df.columns or "JobPct_new" not in df.columns:
    if {"Success","Success_new","Attempts"}.issubset(df.columns):
        df["JobPct"]     = np.where(df["Attempts"]>0, df["Success"]     / df["Attempts"], np.nan)
        df["JobPct_new"] = np.where(df["Attempts"]>0, df["Success_new"] / df["Attempts"], np.nan)
    else:
        raise KeyError("Need JobPct/JobPct_new or (Success, Success_new, Attempts) to compute them.")

# regression coefficient for job_pct (wins per +1.00 = +100pp)
JOB_COEF = 14.7541

df["Season Wins Added"] = JOB_COEF * (df["JobPct_new"] - df["JobPct"])
df["Season Wins Added"] = df["Season Wins Added"].round(2)

# (optional) sort by wins added
df = df.sort_values("Season Wins Added", ascending=False).reset_index(drop=True)
df.index = df.index + 1

df
Out[238]:
Team Attempts Success Success_new JobPct JobPct_new Delta_success Season Wins Added
1 PHI 476 283 296 0.594538 0.621849 13 0.40
2 PIT 443 243 254 0.548533 0.573363 11 0.37
3 HOU 409 210 219 0.513447 0.535452 9 0.32
4 LAA 378 189 196 0.500000 0.518519 7 0.27
5 WSH 466 258 266 0.553648 0.570815 8 0.25
6 BOS 523 265 273 0.506692 0.521989 8 0.23
7 SD 471 275 282 0.583864 0.598726 7 0.22
8 KC 428 245 251 0.572430 0.586449 6 0.21
9 CIN 492 272 278 0.552846 0.565041 6 0.18
10 CWS 444 233 238 0.524775 0.536036 5 0.17
11 SEA 471 239 244 0.507431 0.518047 5 0.16
12 ATL 482 257 262 0.533195 0.543568 5 0.15
13 ATH 407 218 222 0.535627 0.545455 4 0.15
14 CLE 418 241 245 0.576555 0.586124 4 0.14
15 BAL 410 224 228 0.546341 0.556098 4 0.14
16 COL 368 197 200 0.535326 0.543478 3 0.12
17 AZ 516 305 309 0.591085 0.598837 4 0.11
18 TB 491 279 282 0.568228 0.574338 3 0.09
19 LAD 444 250 252 0.563063 0.567568 2 0.07
20 TEX 394 204 206 0.517766 0.522843 2 0.07
21 TOR 488 297 299 0.608607 0.612705 2 0.06
22 CHC 491 279 279 0.568228 0.568228 0 0.00
23 MIL 496 291 290 0.586694 0.584677 -1 -0.03
24 MIN 426 212 211 0.497653 0.495305 -1 -0.03
25 NYM 422 242 240 0.573460 0.568720 -2 -0.07
26 SF 460 262 259 0.569565 0.563043 -3 -0.10
27 MIA 484 279 274 0.576446 0.566116 -5 -0.15
28 STL 488 258 253 0.528689 0.518443 -5 -0.15
29 DET 427 217 211 0.508197 0.494145 -6 -0.21
30 NYY 490 283 274 0.577551 0.559184 -9 -0.27

Get stats of batted balls in 2 strike counts

In [239]:
pd.set_option("display.width", 200)       # widen display
pd.set_option("display.max_columns", 20)  # show all columns

# --- 1. Filter for balls hit in play ---
in_play = two_strikes[two_strikes["description"] == "hit_into_play"].copy()

# --- 2. Basic stats ---
PA = len(in_play)
AB = PA
H = in_play["events"].isin(["single", "double", "triple", "home_run"]).sum()
BB = in_play["events"].eq("walk").sum()
HBP = in_play["events"].eq("hit_by_pitch").sum()
SF = in_play["events"].eq("sac_fly").sum()
TB = in_play["events"].map({
    "single": 1,
    "double": 2,
    "triple": 3,
    "home_run": 4
}).fillna(0).sum()

# --- 3. Rate stats ---
OBP = (H + BB + HBP) / (AB + BB + HBP + SF) if (AB + BB + HBP + SF) > 0 else np.nan
SLG = TB / AB if AB > 0 else np.nan
OPS = OBP + SLG if pd.notna(OBP) and pd.notna(SLG) else np.nan

# --- 4. Output nicely ---
summary = pd.DataFrame([{
    "Spec": "Balls hit into play(2 Strikes)",
    "AB": AB,
    "OBP": round(OBP, 3),
    "SLG": round(SLG, 3),
    "OPS": round(OPS, 3)
}])


summary
Out[239]:
Spec AB OBP SLG OPS
0 Balls hit into play(2 Strikes) 5392 0.308 0.514 0.823

Read in playoff data set

In [240]:
playoffs = pd.read_csv("playoffs.csv")

Sort playoff data set

In [241]:
# Make a copy (optional but safe)
playoffs_sorted = playoffs.copy()

# 1) Convert columns to proper types for sorting
playoffs_sorted["game_date"] = pd.to_datetime(playoffs_sorted["game_date"], errors="coerce")
playoffs_sorted["game_pk"] = pd.to_numeric(playoffs_sorted["game_pk"], errors="coerce")
playoffs_sorted["at_bat_number"] = pd.to_numeric(playoffs_sorted["at_bat_number"], errors="coerce")

# 2) Sort properly:
#    Earliest date → game → at-bat order
playoffs_sorted = playoffs_sorted.sort_values(
    by=["game_date", "game_pk", "at_bat_number"],
    ascending=[True, True, True],
    kind="mergesort"  # stable sort = preserves correct internal order
).reset_index(drop=True)

playoffs_sorted
Out[241]:
pitch_type game_date release_speed release_pos_x release_pos_z player_name batter pitcher events description ... batter_days_until_next_game api_break_z_with_gravity api_break_x_arm api_break_x_batter_in arm_angle attack_angle attack_direction swing_path_tilt intercept_ball_minus_batter_pos_x_inches intercept_ball_minus_batter_pos_y_inches
0 FF 2022-10-07 94.5 -3.14 5.66 Profar, Jurickson 595777 453286 single hit_into_play ... 1.0 1.15 1.23 -1.23 29.3 NaN NaN NaN NaN NaN
1 FC 2022-10-07 89.2 -3.15 5.39 Soto, Juan 665742 453286 strikeout swinging_strike ... 1.0 2.11 -0.12 0.12 24.3 NaN NaN NaN NaN NaN
2 SL 2022-10-07 85.4 -3.19 5.48 Machado, Manny 592518 453286 field_out hit_into_play ... 1.0 2.92 -0.55 -0.55 26.6 NaN NaN NaN NaN NaN
3 FF 2022-10-07 95.4 -3.03 5.63 Bell, Josh 605137 453286 home_run hit_into_play ... 1.0 1.18 0.74 -0.74 33.0 NaN NaN NaN NaN NaN
4 CH 2022-10-07 84.9 -3.28 5.42 Cronenworth, Jake 630105 453286 strikeout swinging_strike ... 1.0 2.74 1.20 -1.20 27.6 NaN NaN NaN NaN NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
9292 CU 2024-10-30 75.5 -1.14 6.05 Edman, Tommy 669242 643410 strikeout swinging_strike ... NaN 5.52 -0.60 0.60 54.4 23.232530 -38.572875 31.194750 31.515521 57.721184
9293 ST 2024-10-30 79.4 -1.48 5.81 Smith, Will 669257 643410 field_out hit_into_play ... NaN 4.20 -0.91 -0.91 45.7 NaN NaN NaN NaN NaN
9294 KC 2024-10-30 77.6 -1.08 5.75 Volpe, Anthony 683011 621111 field_out hit_into_play ... NaN 5.33 -1.08 -1.08 53.9 16.663536 -14.289227 22.537581 37.696324 36.717750
9295 KC 2024-10-30 77.4 -1.23 5.78 Wells, Austin 669224 621111 strikeout swinging_strike ... NaN 5.20 -1.08 1.08 50.0 22.643800 -12.035832 32.683497 40.578398 40.302028
9296 KC 2024-10-30 77.5 -1.11 5.65 Verdugo, Alex 657077 621111 strikeout swinging_strike_blocked ... NaN 5.23 -1.08 1.08 53.2 25.834292 -25.475081 32.831211 34.481443 57.315365

9297 rows × 118 columns

Add job opportunity column

In [242]:
# Make a copy to be safe (optional)
playoffs_sorted = playoffs_sorted.copy()

# Convert outs column just to be sure it's numeric
playoffs_sorted["outs_when_up"] = pd.to_numeric(playoffs_sorted["outs_when_up"], errors="coerce")

# Create boolean flags for base states
runner_on_3rd = playoffs_sorted["on_3b"].notna()
runner_on_2nd = playoffs_sorted["on_2b"].notna()

# Apply the job opportunity definition
playoffs_sorted["job_opportunity"] = (
    ((runner_on_3rd) & (playoffs_sorted["outs_when_up"].isin([0, 1])))
    |
    ((runner_on_2nd) & (playoffs_sorted["outs_when_up"] == 0))
)

Add success/fail column

In [243]:
import numpy as np
import pandas as pd

# Work on a copy (optional)
playoffs_sorted = playoffs_sorted.copy()

# Ensure needed types
playoffs_sorted["game_date"] = pd.to_datetime(playoffs_sorted["game_date"], errors="coerce")
playoffs_sorted["game_pk"] = pd.to_numeric(playoffs_sorted["game_pk"], errors="coerce")
playoffs_sorted["at_bat_number"] = pd.to_numeric(playoffs_sorted["at_bat_number"], errors="coerce")
playoffs_sorted["outs_when_up"] = pd.to_numeric(playoffs_sorted["outs_when_up"], errors="coerce")

# Helper flags for job eligibility (your official spec)
r3_job = playoffs_sorted["on_3b"].notna() & playoffs_sorted["outs_when_up"].isin([0, 1])
r2_job = playoffs_sorted["on_2b"].notna() & (playoffs_sorted["outs_when_up"] == 0) & (~r3_job)  # 3B takes precedence
playoffs_sorted["job_opportunity"] = r3_job | r2_job  # in case it's not already present

# We need the "next AB in the same half-inning" to infer outs after this AB.
# Same half-inning grouping key:
grp_keys = ["game_pk", "inning", "inning_topbot"]

# Next-row fields within same half-inning
playoffs_sorted = playoffs_sorted.sort_values(
    by=["game_date", "game_pk", "at_bat_number"],
    kind="mergesort"
).reset_index(drop=True)

# Compute next AB context within same half-inning (shift -1 per group)
for col in ["outs_when_up", "on_2b", "on_3b"]:
    playoffs_sorted[f"next_{col}"] = (
        playoffs_sorted.groupby(grp_keys, sort=False)[col].shift(-1)
    )

# If there is no next AB in this half-inning, the inning ended -> outs_after = 3
same_half_has_next = playoffs_sorted.groupby(grp_keys)["at_bat_number"].shift(-1).notna()

outs_before = playoffs_sorted["outs_when_up"]
outs_after = np.where(
    same_half_has_next,
    playoffs_sorted["next_outs_when_up"],
    3  # inning ended; outs after must be 3
)

# Outs recorded on this AB (floor at 0)
outs_recorded = np.maximum((outs_after - outs_before).fillna(0), 0)
playoffs_sorted["outs_recorded"] = outs_recorded

# Runs scored by the batting team on this AB
# (Statcast convention: use batting team score deltas on the row)
runs_scored = (playoffs_sorted["post_bat_score"] - playoffs_sorted["bat_score"]).fillna(0)
playoffs_sorted["runs_scored_bat"] = runs_scored

# Did runner from 3rd score?  -> proxy: any run for batting team on this AB
r3_success = runs_scored.gt(0)

# Did runner from 2nd reach 3rd (or score) with <=1 out after?
# We detect "runner reached 3rd" via next on_3b at start of the next AB in SAME half-inning,
# and we treat scoring as success (cannot track individual runner IDs with given columns).
runner_on_3rd_after = (
    same_half_has_next & playoffs_sorted["next_on_3b"].notna()
)

# Success condition for R2 job:
#   - (reached 3rd AFTER) OR (scored)
#   - AND outs_after <= 1 (i.e., still 0 or 1 out after the AB)
r2_success = (runner_on_3rd_after | runs_scored.gt(0)) & (outs_after <= 1)

# Fail / Neutral logic per your rules:

# Runner on 3rd jobs:
#   Success: runs_scored > 0
#   Fail:    outs_recorded > 0 AND runs_scored == 0
#   Neutral: outs_recorded == 0 AND runs_scored == 0
r3_fail    = outs_recorded.gt(0) & runs_scored.eq(0)
r3_neutral = outs_recorded.eq(0) & runs_scored.eq(0)

# Runner on 2nd jobs (0 outs only by spec):
#   Success: (to 3rd OR scored) AND outs_after <= 1
#   Fail:    outs_recorded > 0 AND NOT success
#   Neutral: outs_recorded == 0 AND NOT success
r2_fail    = outs_recorded.gt(0) & (~r2_success)
r2_neutral = outs_recorded.eq(0) & (~r2_success)

# Initialize job_result as NA
job_result = pd.Series(pd.NA, index=playoffs_sorted.index, dtype="string")

# Assign 3B jobs first (precedence)
job_result = job_result.mask(r3_job & r3_success, "Success")
job_result = job_result.mask(r3_job & r3_fail,    "Fail")
job_result = job_result.mask(r3_job & r3_neutral, "Neutral")

# Assign 2B jobs where 3B job is not active
job_result = job_result.mask(r2_job & r2_success, "Success")
job_result = job_result.mask(r2_job & r2_fail,    "Fail")
job_result = job_result.mask(r2_job & r2_neutral, "Neutral")

# Leave non-job rows as <NA> (or set to "Non-Job" if you prefer)
playoffs_sorted["job_result"] = job_result

# Optional sanity checks:
# 1) Ensure all job_opportunity rows received a label
# print(playoffs_sorted.loc[playoffs_sorted["job_opportunity"] & playoffs_sorted["job_result"].isna(),
#                           ["game_pk","inning","inning_topbot","at_bat_number","outs_when_up",
#                            "bat_score","post_bat_score"]].head())

Add batting team column

In [244]:
playoffs_sorted = playoffs_sorted.copy()

# Standardize the inning_topbot column just in case
playoffs_sorted["inning_topbot"] = playoffs_sorted["inning_topbot"].str.lower()

# Create batting_team
playoffs_sorted["batting_team"] = np.where(
    playoffs_sorted["inning_topbot"].str.startswith("top"),
    playoffs_sorted["away_team"],
    playoffs_sorted["home_team"]
)

Run game level regression on the effect of job opportunity for playoff games

In [245]:
df = playoffs_sorted.copy()
df["game_pk"] = pd.to_numeric(df["game_pk"], errors="coerce")
df["at_bat_number"] = pd.to_numeric(df["at_bat_number"], errors="coerce")
df = df.sort_values(["game_date","game_pk","at_bat_number"], kind="mergesort").reset_index(drop=True)

# ---------- 1) Final score per game ----------
# Take the last AB in each game to read final score
final_scores = (
    df.groupby("game_pk", as_index=False)
      .tail(1)[["game_pk","home_team","away_team","post_home_score","post_away_score"]]
      .rename(columns={"post_home_score":"final_home","post_away_score":"final_away"})
)

# ---------- 2) Game-level JOB% per team (job opps only; ignore Neutral) ----------
job_rows = df[(df["job_opportunity"] == True) & (df["job_result"].isin(["Success","Fail"]))]

job_game = (
    job_rows.groupby(["game_pk","batting_team"])["job_result"]
            .value_counts()
            .unstack(fill_value=0)[["Success","Fail"]]
            .assign(
                opps=lambda d: d["Success"] + d["Fail"],
                job_pct=lambda d: np.where(d["opps"]>0, d["Success"]/d["opps"], np.nan)
            )
            .reset_index()
)

# ---------- 3) OBP_game & SLG_game from ALL ABs for that batting team in that game ----------
# Derive components from Statcast 'events'
events = df.copy()
ev = events["events"].astype(str).str.lower()

hit_events = {"single","double","triple","home_run"}
bb_events  = {"walk","intent_walk"}
hbp_events = {"hit_by_pitch"}
sf_events  = {"sac_fly","sac_fly_double_play"}
sh_events  = {"sac_bunt","sac_bunt_double_play"}
no_ab_events = bb_events | hbp_events | sf_events | sh_events | {"catcher_interference","catcher_interf"}

events["H"]   = ev.isin(hit_events).astype(int)
events["BB"]  = ev.isin(bb_events).astype(int)
events["HBP"] = ev.isin(hbp_events).astype(int)
events["SF"]  = ev.isin(sf_events).astype(int)
events["SH"]  = ev.isin(sh_events).astype(int)
# AB credited for most outs/balls in play that are not in no-AB set
events["AB"]  = (~ev.isin(no_ab_events)).astype(int)

# Total bases
tb_map = {"single":1,"double":2,"triple":3,"home_run":4}
events["TB"] = ev.map(tb_map).fillna(0).astype(int)

# Aggregate by team-game
bat_game = (
    events.groupby(["game_pk","batting_team"], as_index=False)
          .agg(H=("H","sum"), BB=("BB","sum"), HBP=("HBP","sum"), AB=("AB","sum"), SF=("SF","sum"), TB=("TB","sum"))
)

# OBP = (H + BB + HBP) / (AB + BB + HBP + SF)
# SLG = TB / AB
bat_game["PA_for_OBP"] = bat_game["AB"] + bat_game["BB"] + bat_game["HBP"] + bat_game["SF"]
bat_game["OBP_game"] = np.where(bat_game["PA_for_OBP"]>0,
                                (bat_game["H"] + bat_game["BB"] + bat_game["HBP"]) / bat_game["PA_for_OBP"],
                                np.nan)
bat_game["SLG_game"] = np.where(bat_game["AB"]>0, bat_game["TB"] / bat_game["AB"], np.nan)

# ---------- 4) Merge scores & compute is_home, runs_allowed, win ----------
# First, attach home/away labels for each team-game
teams = df[["game_pk","home_team","away_team"]].drop_duplicates()

game_team = (
    bat_game.merge(teams, on="game_pk", how="left")
            .merge(final_scores, on=["game_pk","home_team","away_team"], how="left")
)

game_team["is_home"] = (game_team["batting_team"] == game_team["home_team"]).astype(int)

# runs_allowed is opponent's final runs
game_team["runs_allowed"] = np.where(game_team["is_home"]==1, game_team["final_away"], game_team["final_home"])

# team final runs (for win flag)
game_team["runs_scored"] = np.where(game_team["is_home"]==1, game_team["final_home"], game_team["final_away"])
game_team["win"] = (game_team["runs_scored"] > game_team["runs_allowed"]).astype(int)

# ---------- 5) Add game Job% (merge) ----------
game_team = game_team.merge(job_game[["game_pk","batting_team","job_pct"]],
                            on=["game_pk","batting_team"], how="left")

# Optional: drop games with no job opps (job_pct NaN) or keep them as missing
# Common approach: drop rows with any NaNs needed for regression
reg_df = game_team[["win","job_pct","OBP_game","SLG_game","runs_allowed","is_home"]].dropna().reset_index(drop=True)

# ---------- 6) Run OLS ----------
import statsmodels.api as sm
X = reg_df[["job_pct","OBP_game","SLG_game","runs_allowed","is_home"]]
X = sm.add_constant(X)
y = reg_df["win"]

model = sm.OLS(y, X).fit()
print(model.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                    win   R-squared:                       0.594
Model:                            OLS   Adj. R-squared:                  0.584
Method:                 Least Squares   F-statistic:                     57.91
Date:                Wed, 22 Oct 2025   Prob (F-statistic):           6.30e-37
Time:                        01:29:14   Log-Likelihood:                -55.901
No. Observations:                 204   AIC:                             123.8
Df Residuals:                     198   BIC:                             143.7
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
================================================================================
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
const           -0.0245      0.108     -0.226      0.821      -0.238       0.189
job_pct          0.1678      0.069      2.446      0.015       0.032       0.303
OBP_game         1.7684      0.434      4.077      0.000       0.913       2.624
SLG_game         0.8922      0.193      4.633      0.000       0.512       1.272
runs_allowed    -0.1022      0.008    -12.544      0.000      -0.118      -0.086
is_home         -0.0416      0.046     -0.897      0.371      -0.133       0.050
==============================================================================
Omnibus:                       16.298   Durbin-Watson:                   2.538
Prob(Omnibus):                  0.000   Jarque-Bera (JB):                6.434
Skew:                           0.143   Prob(JB):                       0.0401
Kurtosis:                       2.178   Cond. No.                         101.
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.