Our group consists of Luke Albright and James Manzer for this data science project and presentation. We are exploring historical Major League Baseball statistics to determine which variables correlate with the number of wins a team records during a season, as well as how well those variables can predict an MLB team's win total.
The dataset we are interested in studying involves Major League Baseball statistics. Both of us have a history of playing baseball and are now fans of the Chicago Cubs and Texas Rangers. James is even a member of the club baseball team here at Tulane. Given our passion for baseball and its vast records of statistics, we feel compelled to take a deep dive into this space.
We have identified the Lahman Database as our data source for this project. The Lahman Database is a comprehensive collection of baseball statistics that covers Major League Baseball from the 19th century to the present. It is widely used by researchers, analysts and enthusiasts to analyze various teams and players throughout history. The database is freely available and can be downloaded in various formats. We will be using the CSV version of this data to import into our notebook.
The data we downloaded is divided into three main sections (core, upstream, and contrib); however, we will focus on the core and contrib folders, which include player and team batting, fielding, and pitching statistics as well as salary data. More specifically, we will look at salary, pitching, and hitting data aggregated by team and by year, some of which will need to be preprocessed. We believe this trove of information will enable us to identify variables that correlate with the number of wins a franchise records in a given year, as well as predict a franchise's future win totals from historical information.
We will use historical data from 2004-2016 for our model. The salary data is only available through 2016, which sets the upper bound. We chose 2004 as the starting point because of the league's shift to more modern standards around that time: the steroid era lasted through the 1990s until league-wide PED testing was implemented in 2003, inflating statistics for a number of players and teams. Because of this, our data begins in 2004.
This first block of code imports the libraries used throughout this project.
# Import the necessary libraries
import zipfile

import numpy as np
from numpy import mean, std, absolute
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import statsmodels.formula.api as smf

from sklearn import linear_model, metrics
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.metrics import (mean_squared_error, mean_absolute_error,
                             mean_absolute_percentage_error, make_scorer)
from sklearn.model_selection import (train_test_split, cross_val_score, cross_validate,
                                     RepeatedKFold, ShuffleSplit)
from sklearn.preprocessing import scale, StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils import shuffle
For ease of use, we created a public GitHub repository containing the raw zip file from the Lahman website. We will now clone that repository, navigate to the proper directory, and extract the zip file.
# Clone the GitHub repository and extract the zip file
%cd /content
!git clone https://github.com/lalbright22/DataScience2023.git
%cd /content/DataScience2023/

zip_file_path = '/content/DataScience2023/baseballdatabank-2023.1.zip'
extract_dir = '/content/DataScience2023/'

# Open the archive and extract all of its contents into the target directory
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(extract_dir)
Now that the file is unzipped, we will navigate to the contrib sub-folder and read the Salaries CSV file into a pandas dataframe.
%cd baseballdatabank-2023.1/contrib
# Upload salaries dataframe
salaries = pd.read_csv("Salaries.csv")
salaries.head()
/content/DataScience2023/baseballdatabank-2023.1/contrib
| | yearID | teamID | lgID | playerID | salary |
| --- | --- | --- | --- | --- | --- |
0 | 1985 | ATL | NL | barkele01 | 870000 |
1 | 1985 | ATL | NL | bedrost01 | 550000 |
2 | 1985 | ATL | NL | benedbr01 | 545000 |
3 | 1985 | ATL | NL | campri01 | 633333 |
4 | 1985 | ATL | NL | ceronri01 | 625000 |
Once that's done, we will navigate to the core sub-folder containing the Teams CSV file and read it into a pandas dataframe.
%cd ../core
# Read the Teams CSV into a dataframe
teams = pd.read_csv("Teams.csv")
teams
/content/DataScience2023/baseballdatabank-2023.1/core
| | yearID | lgID | teamID | franchID | divID | Rank | G | Ghome | W | L | ... | DP | FP | name | park | attendance | BPF | PPF | teamIDBR | teamIDlahman45 | teamIDretro |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
0 | 1871 | NaN | BS1 | BNA | NaN | 3 | 31 | NaN | 20 | 10 | ... | 24 | 0.834 | Boston Red Stockings | South End Grounds I | NaN | 103 | 98 | BOS | BS1 | BS1 |
1 | 1871 | NaN | CH1 | CNA | NaN | 2 | 28 | NaN | 19 | 9 | ... | 16 | 0.829 | Chicago White Stockings | Union Base-Ball Grounds | NaN | 104 | 102 | CHI | CH1 | CH1 |
2 | 1871 | NaN | CL1 | CFC | NaN | 8 | 29 | NaN | 10 | 19 | ... | 15 | 0.818 | Cleveland Forest Citys | National Association Grounds | NaN | 96 | 100 | CLE | CL1 | CL1 |
3 | 1871 | NaN | FW1 | KEK | NaN | 7 | 19 | NaN | 7 | 12 | ... | 8 | 0.803 | Fort Wayne Kekiongas | Hamilton Field | NaN | 101 | 107 | KEK | FW1 | FW1 |
4 | 1871 | NaN | NY2 | NNA | NaN | 5 | 33 | NaN | 16 | 17 | ... | 14 | 0.840 | New York Mutuals | Union Grounds (Brooklyn) | NaN | 90 | 88 | NYU | NY2 | NY2 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
3010 | 2022 | NL | SLN | STL | C | 1 | 162 | 81.0 | 93 | 69 | ... | 181 | 0.989 | St. Louis Cardinals | Busch Stadium III | 3320551.0 | 94 | 94 | STL | SLN | SLN |
3011 | 2022 | AL | TBA | TBD | E | 1 | 162 | 81.0 | 86 | 76 | ... | 110 | 0.985 | Tampa Bay Rays | Tropicana Field | 1128127.0 | 95 | 93 | TBR | TBA | TBA |
3012 | 2022 | AL | TEX | TEX | W | 4 | 162 | 81.0 | 68 | 94 | ... | 143 | 0.984 | Texas Rangers | Globe Life Field | 2011361.0 | 100 | 101 | TEX | TEX | TEX |
3013 | 2022 | AL | TOR | TOR | E | 2 | 162 | 81.0 | 92 | 70 | ... | 120 | 0.986 | Toronto Blue Jays | Rogers Centre | 2653830.0 | 100 | 100 | TOR | TOR | TOR |
3014 | 2022 | NL | WAS | WSN | E | 5 | 162 | 81.0 | 55 | 107 | ... | 126 | 0.982 | Washington Nationals | Nationals Park | 2026401.0 | 94 | 96 | WSN | MON | WAS |
3015 rows × 48 columns
We will now begin preprocessing our data, starting with the salaries dataframe, where we aggregate player salaries by team and year.
# Sum salaries by team and year (league stays the same in all cases) in order to merge with teams dataset
team_salaries = salaries.groupby(["teamID", "yearID"])["salary"].sum().reset_index() #Groups and sums the player salaries by team.
team_salaries
| | teamID | yearID | salary |
| --- | --- | --- | --- |
0 | ANA | 1997 | 31135472 |
1 | ANA | 1998 | 41281000 |
2 | ANA | 1999 | 55388166 |
3 | ANA | 2000 | 51464167 |
4 | ANA | 2001 | 47535167 |
... | ... | ... | ... |
913 | WAS | 2012 | 80855143 |
914 | WAS | 2013 | 113703270 |
915 | WAS | 2014 | 131983680 |
916 | WAS | 2015 | 155587472 |
917 | WAS | 2016 | 141652646 |
918 rows × 3 columns
Now we will calculate a few new variables from the teams dataframe, including run differential (differential), batting average (AVG), on base percentage (OBP), singles (1B) and slugging (SLG).
# Calculate run differential (runs scored minus runs allowed)
teams["differential"] = teams['R'] - teams['RA']
# Calculate batting average by the formula below
teams["AVG"] = (teams['H']) / (teams['AB'])
# Calculate on base percentage by the formula below
teams["OBP"] = (teams['H'] + teams['BB'] + teams['HBP']) / (teams['AB'] + teams['BB'] + teams['HBP'] + teams['SF'])
# Calculate singles by the formula below
teams["1B"] = teams['H'] - teams['2B'] - teams['3B'] - teams['HR']
# Calculate slugging percentage by the formula below
teams["SLG"] = (teams['1B'] + 2*teams['2B'] + 3*teams['3B'] + 4*teams['HR'])/teams['AB']
teams
| | yearID | lgID | teamID | franchID | divID | Rank | G | Ghome | W | L | ... | BPF | PPF | teamIDBR | teamIDlahman45 | teamIDretro | differential | AVG | OBP | 1B | SLG |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
0 | 1871 | NaN | BS1 | BNA | NaN | 3 | 31 | NaN | 20 | 10 | ... | 103 | 98 | BOS | BS1 | BS1 | 98 | 0.310496 | NaN | 316 | 0.422012 |
1 | 1871 | NaN | CH1 | CNA | NaN | 2 | 28 | NaN | 19 | 9 | ... | 104 | 102 | CHI | CH1 | CH1 | 61 | 0.270067 | NaN | 240 | 0.373746 |
2 | 1871 | NaN | CL1 | CFC | NaN | 8 | 29 | NaN | 10 | 19 | ... | 96 | 100 | CLE | CL1 | CL1 | -92 | 0.276560 | NaN | 246 | 0.391231 |
3 | 1871 | NaN | FW1 | KEK | NaN | 7 | 19 | NaN | 7 | 12 | ... | 101 | 107 | KEK | FW1 | FW1 | -106 | 0.238606 | NaN | 149 | 0.293566 |
4 | 1871 | NaN | NY2 | NNA | NaN | 5 | 33 | NaN | 16 | 17 | ... | 90 | 88 | NYU | NY2 | NY2 | -11 | 0.287037 | NaN | 338 | 0.349715 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
3010 | 2022 | NL | SLN | STL | C | 1 | 162 | 81.0 | 93 | 69 | ... | 94 | 94 | STL | SLN | SLN | 135 | 0.252183 | 0.325268 | 878 | 0.420124 |
3011 | 2022 | AL | TBA | TBD | E | 1 | 162 | 81.0 | 86 | 76 | ... | 95 | 93 | TBR | TBA | TBA | 52 | 0.239098 | 0.308500 | 842 | 0.377125 |
3012 | 2022 | AL | TEX | TEX | W | 4 | 162 | 81.0 | 68 | 94 | ... | 100 | 101 | TEX | TEX | TEX | -36 | 0.238773 | 0.300881 | 866 | 0.395400 |
3013 | 2022 | AL | TOR | TOR | E | 2 | 162 | 81.0 | 92 | 70 | ... | 100 | 100 | TOR | TOR | TOR | 96 | 0.263546 | 0.328667 | 945 | 0.431143 |
3014 | 2022 | NL | WAS | WSN | E | 5 | 162 | 81.0 | 55 | 107 | ... | 94 | 96 | WSN | MON | WAS | -252 | 0.248620 | 0.310229 | 943 | 0.377438 |
3015 rows × 53 columns
Here we will drop a few unnecessary columns that will not be used in our model.
# Drop unnecessary columns
teams.drop(columns=["teamIDBR", "teamIDlahman45", "teamIDretro", "IPouts"], inplace=True)
Here we will merge the two dataframes on team and year, resulting in our final baseball dataframe that will be used in our model.
# This results in a dataframe with statistics for each team and year, including total salary, wins, attendance, batting and pitching park factors, and more.
baseball = pd.merge(team_salaries, teams, on=["teamID", "yearID"])
Lastly, we will filter the yearID column of the dataframe to only include the years 2004-2016. This also serves as a sanity check on our preprocessing: with 30 teams, 30 teams × 13 years = 390 rows, which matches the output below.
baseball = baseball[(baseball['yearID'] >= 2004) & (baseball['yearID'] <= 2016)]
baseball
| | teamID | yearID | salary | lgID | franchID | divID | Rank | G | Ghome | W | ... | name | park | attendance | BPF | PPF | differential | AVG | OBP | 1B | SLG |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
7 | ANA | 2004 | 100534667 | AL | ANA | W | 1 | 162 | 81.0 | 92 | ... | Anaheim Angels | Angels Stadium of Anaheim | 3375677.0 | 97 | 97 | 102 | 0.282467 | 0.340760 | 1132 | 0.429075 |
14 | ARI | 2004 | 69780750 | NL | ARI | W | 5 | 162 | 81.0 | 51 | ... | Arizona Diamondbacks | Bank One Ballpark | 2519560.0 | 105 | 107 | -284 | 0.252706 | 0.309889 | 933 | 0.392677 |
15 | ARI | 2005 | 62329166 | NL | ARI | W | 2 | 162 | 81.0 | 77 | ... | Arizona Diamondbacks | Bank One Ballpark | 2059424.0 | 103 | 105 | -160 | 0.255676 | 0.332481 | 910 | 0.421081 |
16 | ARI | 2006 | 59684226 | NL | ARI | W | 4 | 162 | 81.0 | 76 | ... | Arizona Diamondbacks | Chase Field | 2091685.0 | 105 | 105 | -15 | 0.266785 | 0.331313 | 977 | 0.423915 |
17 | ARI | 2007 | 52067546 | NL | ARI | W | 1 | 162 | 81.0 | 90 | ... | Arizona Diamondbacks | Chase Field | 2325249.0 | 107 | 107 | -20 | 0.250093 | 0.320761 | 853 | 0.412931 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
913 | WAS | 2012 | 80855143 | NL | WSN | E | 1 | 162 | 81.0 | 98 | ... | Washington Nationals | Nationals Park | 2370794.0 | 99 | 101 | 137 | 0.261443 | 0.322152 | 948 | 0.427605 |
914 | WAS | 2013 | 113703270 | NL | WSN | E | 2 | 162 | 81.0 | 86 | ... | Washington Nationals | Nationals Park | 2652422.0 | 102 | 101 | 30 | 0.251104 | 0.312594 | 918 | 0.397535 |
915 | WAS | 2014 | 131983680 | NL | WSN | E | 1 | 162 | 81.0 | 96 | ... | Washington Nationals | Nationals Park | 2579389.0 | 104 | 102 | 131 | 0.253158 | 0.320988 | 959 | 0.392999 |
916 | WAS | 2015 | 155587472 | NL | WSN | E | 2 | 162 | 81.0 | 83 | ... | Washington Nationals | Nationals Park | 2619843.0 | 102 | 99 | 68 | 0.251105 | 0.321016 | 908 | 0.402542 |
917 | WAS | 2016 | 141652646 | NL | WSN | E | 1 | 162 | 81.0 | 95 | ... | Washington Nationals | Nationals Park | 2481938.0 | 100 | 98 | 151 | 0.255556 | 0.325532 | 903 | 0.425865 |
390 rows × 50 columns
baseball.columns
Index(['teamID', 'yearID', 'salary', 'lgID', 'franchID', 'divID', 'Rank', 'G', 'Ghome', 'W', 'L', 'DivWin', 'WCWin', 'LgWin', 'WSWin', 'R', 'AB', 'H', '2B', '3B', 'HR', 'BB', 'SO', 'SB', 'CS', 'HBP', 'SF', 'RA', 'ER', 'ERA', 'CG', 'SHO', 'SV', 'HA', 'HRA', 'BBA', 'SOA', 'E', 'DP', 'FP', 'name', 'park', 'attendance', 'BPF', 'PPF', 'differential', 'AVG', 'OBP', '1B', 'SLG'], dtype='object')
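As a quick check on the 30 teams × 13 seasons arithmetic, we can also assert those counts directly. Below is a minimal sketch (not part of the original pipeline) that assumes the baseball dataframe built above:
# Sanity checks on the filtered, merged dataframe
assert baseball['yearID'].between(2004, 2016).all()                 # only the 2004-2016 seasons remain
assert baseball.groupby('yearID')['teamID'].nunique().eq(30).all()  # 30 franchises in every season
assert len(baseball) == 30 * 13                                     # 390 team-seasons in total
print(baseball.shape)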
Now that the data is preprocessed, we will visualize it and perform exploratory data analysis to identify trends and correlations and begin to form a hypothesis about which variables correlate most strongly with wins.
baseball_nl = baseball[baseball['lgID'] == 'NL']
baseball_al = baseball[baseball['lgID'] == 'AL']
# Plot for National League
plt.figure(figsize=(12, 8)) #Adjust the figure size
sns.scatterplot(data=baseball_nl, x='yearID', y='W', hue='teamID', palette='plasma', edgecolor='w', s=100)
plt.title('W by NL Teams Each Year')
plt.xlabel('Year')
plt.ylabel('W')
plt.legend(title='teamID', bbox_to_anchor=(1.05, 1), loc='upper left') # Adjust legend position
# Add a horizontal line at the mean # of wins
plt.axhline(81, color='green', linestyle='--', label='Threshold (81)')
plt.show()
# Plot for American League
plt.figure(figsize=(12, 8)) #Adjust the figure size
sns.scatterplot(data=baseball_al, x='yearID', y='W', hue='teamID', palette='plasma', edgecolor='w', s=100)
plt.title('W by AL Teams Each Year')
plt.xlabel('Year')
plt.ylabel('W')
plt.legend(title='teamID', bbox_to_anchor=(1.05, 1), loc='upper left') # Adjust legend position
# Add a horizontal line at the mean # of wins
plt.axhline(81, color='green', linestyle='--', label='Threshold (81)')
plt.show()
Shown above are scatterplots of the number of wins (our dependent variable) each team had per year, split between the NL and AL. This gives us an idea of the distribution of the number of wins a team has in a given year. As the data shows, wins appear to have a relatively large standard deviation. Let's take a closer look.
wins_std_by_league_and_year = baseball.groupby(['lgID', 'yearID'])['W'].std()
print("Standard Deviation of Wins by League and Year:")
print(wins_std_by_league_and_year)
Standard Deviation of Wins by League and Year:
lgID  yearID
AL    2004    13.473621
      2005    13.070393
      2006    12.000000
      2007    10.885266
      2008    11.338207
      2009    12.162137
      2010    11.640211
      2011    11.432795
      2012    10.967484
      2013    13.704361
      2014     9.927355
      2015     7.225616
      2016    10.511672
NL    2004    14.051690
      2005     8.830817
      2006     8.035079
      2007     7.957177
      2008    11.002841
      2009    11.094856
      2010    10.776518
      2011    11.763645
      2012    12.992305
      2013    11.093542
      2014     9.523405
      2015    13.097801
      2016    11.089678
Name: W, dtype: float64
The standard deviation of wins by league per year is usually around 11-12 games, and there appears to be no consistent trend in the variation over time.
Heatmaps are a helpful way of visualizing correlations among a number of variables. Below are four heatmaps, each with wins as the first row, that together cover all of the variables in our dataframe and their pairwise correlations.
sns.heatmap(baseball[['W', 'BPF', 'R', 'AB', 'H', '2B', '3B', 'HR', 'BB', 'SO']].corr(), annot = True, cmap = 'coolwarm', fmt='.2f')
From the heatmap above, it appears that runs, home runs, and walks have positive correlations with wins, while strikeouts and, oddly, triples have negative correlations. These all make sense apart from triples, which could be explained by the fact that a triple does not score the batter and is not as valuable as a home run.
sns.heatmap(baseball[['W', 'SB', 'CS', 'HBP', 'SF', 'RA', 'ER', 'CG', 'SHO']].corr(), annot = True, cmap = 'coolwarm', fmt='.2f')
From the heatmap above, shutouts and sacrifice flies are the variables most strongly positively correlated with wins, which both make sense since a shutout guarantees a win in a given game and sacrifice flies score runs. Runs against and earned runs both have strong negative correlations with wins, which makes sense since allowing a high number of runs, earned or not, does not bode well for a team's winning prospects.
sns.heatmap(baseball[['W', 'SV', 'HA', 'HRA', 'BBA', 'SOA', 'E', 'DP', 'FP', 'attendance']].corr(), annot = True, cmap = 'coolwarm', fmt='.2f')
From the heatmap above, saves, attendance, fielding percentage, and strikeouts recorded by the pitching staff (SOA) are all positively correlated with wins. All of these make sense, and while attendance is not an in-game statistic, it stands to reason that winning teams draw larger crowds. Hits allowed, walks allowed, errors, and home runs allowed are negatively correlated with wins, which also makes sense.
sns.heatmap(baseball[['W', 'PPF', 'differential', 'AVG', 'OBP', '1B', 'SLG', 'salary']].corr(), annot = True, cmap = 'coolwarm', fmt='.2f')
In our last heatmap above, run differential is highly positively correlated with wins, as teams that score more runs than they allow tend to win more often. On-base percentage, slugging percentage, salary, and batting average are also positively correlated with wins.
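Rather than reading the four heatmaps row by row, the single row of correlations with wins can also be pulled out and sorted. This is a small optional sketch; the numeric_only flag assumes a recent pandas version:
# Correlation of every numeric column with wins, from most negative to most positive
win_corr = baseball.corr(numeric_only=True)['W'].drop('W').sort_values()
print(win_corr)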
Next, we will look at regression plots of wins against the variables we viewed as important from our heatmap visualizations. For each graph, we use seaborn's regplot function to plot wins on the y-axis against the selected variable on the x-axis. We will also run a simple linear regression of wins on each selected variable, giving us an idea of how statistically significant that variable is with respect to wins in isolation. Because the data is not yet standardized, the coefficients are not directly comparable across variables.
diff_v_win = sns.regplot(x='differential', y='W', data=baseball, scatter_kws={'s':10})
diff_v_win.set(xlabel='Run Differential', ylabel='Wins')
mod = smf.ols(formula='W ~ differential', data=baseball)
results = mod.fit()
print(results.summary())
OLS Regression Results: W ~ differential
R-squared: 0.858   Adj. R-squared: 0.858   F-statistic: 2343 (Prob 1.68e-166)   No. Observations: 390

                  coef    std err        t    P>|t|    [0.025    0.975]
Intercept      80.9795      0.209  387.757    0.000    80.569    81.390
differential    0.1008      0.002   48.405    0.000     0.097     0.105
on_base = sns.regplot(x='OBP', y='W', data=baseball, scatter_kws={'s':10})
on_base.set(xlabel='On Base Percentage', ylabel='Wins')
mod = smf.ols(formula='W ~ OBP', data=baseball)
results = mod.fit()
print(results.summary())
OLS Regression Results: W ~ OBP
R-squared: 0.214   Adj. R-squared: 0.212   F-statistic: 105.9 (Prob 4.00e-22)   No. Observations: 390

                  coef    std err        t    P>|t|    [0.025    0.975]
Intercept     -37.8732     11.559   -3.277    0.001   -60.599   -15.147
OBP           364.7135     35.438   10.292    0.000   295.039   434.388
sns.regplot(x='salary', y='W', data=baseball, scatter_kws={'s':20})
This is a regression plot showing the relationship between total team salary and wins for a given team in a given year. There is a slight positive trend but only a weak correlation between the two variables.
mod = smf.ols(formula='W ~ salary', data=baseball)
results = mod.fit()
print(results.summary())
OLS Regression Results: W ~ salary
R-squared: 0.136   Adj. R-squared: 0.134   F-statistic: 60.95 (Prob 5.51e-14)   No. Observations: 390

                  coef    std err        t    P>|t|    [0.025    0.975]
Intercept      71.8865      1.274   56.447    0.000    69.383    74.390
salary       9.759e-08   1.25e-08    7.807    0.000   7.3e-08   1.22e-07

Note: the condition number is large (2.52e+08), which might indicate strong multicollinearity or other numerical problems.
sns.regplot(x='BPF', y='W', data=baseball, scatter_kws={'s':20})
This is a regression plot showing the relationship between batting park factor (the tendency for batters to hit above average in a given ballpark) and wins for a given team in a given year. There is no meaningful trend or correlation between the two variables.
mod = smf.ols(formula='W ~ BPF', data=baseball)
results = mod.fit()
print(results.summary())
OLS Regression Results: W ~ BPF
R-squared: 0.009   Adj. R-squared: 0.006   F-statistic: 3.365 (Prob 0.0674)   No. Observations: 390

                  coef    std err        t    P>|t|    [0.025    0.975]
Intercept      60.9305     10.944    5.568    0.000    39.414    82.447
BPF             0.2002      0.109    1.834    0.067    -0.014     0.415

Note: the condition number is large (1.99e+03), which might indicate strong multicollinearity or other numerical problems.
ppf_v_win = sns.regplot(x='PPF', y='W', data=baseball, scatter_kws={'s':20})
ppf_v_win.set(xlabel='Pitching Park Factor', ylabel='Wins')
This is a regression plot showing the relationship between pitching park factor (the tendency for hitters to hit below average in a given ballpark) and wins for a given team in a given year. There is a slight negative trend and little correlation between the two variables.
mod = smf.ols(formula='W ~ PPF', data=baseball)
results = mod.fit()
print(results.summary())
OLS Regression Results: W ~ PPF
R-squared: 0.002   Adj. R-squared: -0.001   F-statistic: 0.6023 (Prob 0.438)   No. Observations: 390

                  coef    std err        t    P>|t|    [0.025    0.975]
Intercept      89.6310     11.161    8.031    0.000    67.687   111.575
PPF            -0.0864      0.111   -0.776    0.438    -0.305     0.132

Note: the condition number is large (2.02e+03), which might indicate strong multicollinearity or other numerical problems.
sns.regplot(x='attendance', y='W', data=baseball, scatter_kws={'s':20})
This is a regression plot showing the relationship between attendance and wins for a given team in a given year. There is a positive trend and some correlation between the two variables.
mod = smf.ols(formula='W ~ attendance', data=baseball)
results = mod.fit()
print(results.summary())
OLS Regression Results: W ~ attendance
R-squared: 0.232   Adj. R-squared: 0.230   F-statistic: 117.5 (Prob 4.31e-24)   No. Observations: 390

                  coef    std err        t    P>|t|    [0.025    0.975]
Intercept      61.3386      1.876   32.698    0.000    57.650    65.027
attendance   7.885e-06   7.27e-07   10.839    0.000  6.45e-06  9.31e-06

Note: the condition number is large (9.97e+06), which might indicate strong multicollinearity or other numerical problems.
sns.regplot(x='FP', y='W', data=baseball, scatter_kws={'s':20})
This is a regression plot showing the relationship between fielding percentage and wins for a given team in a given year. There is a positive trend but only a weak correlation between the two variables.
mod = smf.ols(formula='W ~ FP', data=baseball)
results = mod.fit()
print(results.summary())
OLS Regression Results: W ~ FP
R-squared: 0.156   Adj. R-squared: 0.154   F-statistic: 71.68 (Prob 5.26e-16)   No. Observations: 390

                  coef    std err        t    P>|t|    [0.025    0.975]
Intercept   -1633.7785    202.532   -8.067    0.000  -2031.976  -1235.581
FP           1743.1308    205.883    8.467    0.000   1338.346   2147.916
From the linear regression outputs, many of the results point to moderately strong positive and negative correlations between wins and the selected variables. Also of note, many of the outputs warn of strong multicollinearity, which makes sense: from our heatmaps, we saw that many of the independent variables are highly correlated with one another, which produces multicollinearity. For our model, it may be best to try alternatives to ordinary linear regression.
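One way to quantify the multicollinearity these outputs warn about is the variance inflation factor (VIF). The sketch below uses statsmodels for illustration; the handful of offensive columns checked here is an assumed example rather than part of our original pipeline:
from statsmodels.stats.outliers_influence import variance_inflation_factor
import statsmodels.api as sm

# A few of the offensive variables that looked highly inter-correlated in the heatmaps
check_cols = ['R', 'H', 'HR', 'BB', 'AVG', 'OBP', 'SLG']
X_vif = sm.add_constant(baseball[check_cols])

# VIF for each column; values well above ~10 suggest strong multicollinearity
vif = pd.Series(
    [variance_inflation_factor(X_vif.values, i) for i in range(X_vif.shape[1])],
    index=X_vif.columns,
)
print(vif.drop('const'))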
For the modeling section of our tutorial, we will focus on ridge and lasso regression to evaluate our data. As shown in the exploratory data analysis, many of the independent variables in our dataset exhibit multicollinearity, which makes the individual coefficients of correlated variables hard to interpret. Because we want to determine which variables are most important in predicting wins, we will rule out ordinary linear regression for our model.
We will use ridge regression because it adds a penalty term to the linear regression objective that controls the size of the coefficients. The penalty term is the sum of the squared coefficients multiplied by a regularization parameter. With this many correlated features, overfitting is a concern, so a more constrained model is preferable.
Similarly, we will use lasso regression as an alternative to ridge in order to create a slimmer model. It is similar to ridge, but instead of penalizing the sum of squared coefficients it penalizes the sum of the absolute values of the coefficients. Lasso also has a tendency to produce sparse models by setting some coefficients exactly to zero, effectively performing feature selection.
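Both penalties depend on a regularization strength alpha. In the model below we keep sklearn's default of alpha = 1.0, but alpha could also be chosen by cross-validation. A possible sketch using sklearn's built-in LassoCV and RidgeCV (the alpha grid here is an assumption, not something we tuned):
from sklearn.linear_model import LassoCV, RidgeCV
from sklearn.pipeline import make_pipeline

# Candidate regularization strengths (illustrative grid)
alphas = np.logspace(-3, 2, 50)

# Pipelines so that the scaler is re-fit inside every cross-validation fold
lasso_cv_pipe = make_pipeline(StandardScaler(), LassoCV(alphas=alphas, cv=5))
ridge_cv_pipe = make_pipeline(StandardScaler(), RidgeCV(alphas=alphas, cv=5))

# Once X and y are defined below, fitting the pipelines exposes the selected alphas:
# lasso_cv_pipe.fit(X, y); print(lasso_cv_pipe[-1].alpha_)
# ridge_cv_pipe.fit(X, y); print(ridge_cv_pipe[-1].alpha_)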
Below is our entire model. The steps are explained in comments, and we will analyze the output afterwards.
# Select relevant features
features = ['R', 'AB', 'H', '2B', '3B', 'HR', 'BB', 'SO', 'SB', 'CS', 'HBP', 'SF', 'RA', 'ER', 'CG', 'SHO', 'SV', 'HA', 'HRA', 'BBA', 'SOA', 'E', 'DP', 'FP', 'attendance', 'BPF', 'PPF', 'differential', 'AVG', 'OBP', '1B', 'SLG', 'salary']
# Split the data into features (X) and the target variable (y)
X = baseball[features]
y = baseball['W']
# Shuffle X and y to avoid data being grouped by year
X_shuffled, y_shuffled = shuffle(X, y, random_state=42)
# Split the shuffled data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_shuffled, y_shuffled, test_size=0.2, random_state=42)
# Standardizing the features, a good all-purpose scaling method
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Lasso Regression with cross-validation
lasso_model = Lasso(alpha=1.0)
lasso_cv_results = cross_validate(lasso_model, X_train_scaled, y_train, cv=5, scoring={'mse': make_scorer(mean_squared_error), 'mae': make_scorer(mean_absolute_error), 'mape': 'neg_mean_absolute_percentage_error'}, return_train_score=True)
# Ridge Regression with cross-validation
ridge_model = Ridge(alpha=1.0)
ridge_cv_results = cross_validate(ridge_model, X_train_scaled, y_train, cv=5, scoring={'mse': make_scorer(mean_squared_error), 'mae': make_scorer(mean_absolute_error), 'mape': 'neg_mean_absolute_percentage_error'}, return_train_score=True)
# Outputting cross-validation results
print("Lasso Regression Cross-Validation Results:")
print("Train MSE:", lasso_cv_results['train_mse'])
print("Test MSE:", lasso_cv_results['test_mse'])
print("Train MAE:", lasso_cv_results['train_mae'])
print("Test MAE:", lasso_cv_results['test_mae'])
print("Train MAPE:", -lasso_cv_results['train_mape'].mean())
print("Test MAPE:", -lasso_cv_results['test_mape'].mean())
print("\nRidge Regression Cross-Validation Results:")
print("Train MSE:", ridge_cv_results['train_mse'])
print("Test MSE:", ridge_cv_results['test_mse'])
print("Train MAE:", ridge_cv_results['train_mae'])
print("Test MAE:", ridge_cv_results['test_mae'])
print("Train MAPE:", -ridge_cv_results['train_mape'].mean())
print("Test MAPE:", -ridge_cv_results['test_mape'].mean())
# Fit final models on the entire training set
lasso_model.fit(X_train_scaled, y_train)
ridge_model.fit(X_train_scaled, y_train)
# Making predictions on the test set
y_pred_lasso = lasso_model.predict(X_test_scaled)
y_pred_ridge = ridge_model.predict(X_test_scaled)
# Evaluating performance on the test set
mse_lasso = mean_squared_error(y_test, y_pred_lasso)
mse_ridge = mean_squared_error(y_test, y_pred_ridge)
mae_lasso = mean_absolute_error(y_test, y_pred_lasso)
mae_ridge = mean_absolute_error(y_test, y_pred_ridge)
# Approximate MAPE on the test set: MAE expressed as a percentage of the mean number of wins
mape_lasso = mean_absolute_error(y_test, y_pred_lasso) / y_test.mean() * 100
mape_ridge = mean_absolute_error(y_test, y_pred_ridge) / y_test.mean() * 100
# Outputting additional metrics
print("\nLasso Regression Metrics on Test Set:")
print("Test MSE:", mse_lasso)
print("Test MAE:", mae_lasso)
print("Test MAPE:", mape_lasso)
print("\nRidge Regression Metrics on Test Set:")
print("Test MSE:", mse_ridge)
print("Test MAE:", mae_ridge)
print("Test MAPE:", mape_ridge)
# Collect the fitted coefficients as labeled series
lasso_coefficients = pd.Series(lasso_model.coef_, index=features, name='Lasso Coefficients')
ridge_coefficients = pd.Series(ridge_model.coef_, index=features, name='Ridge Coefficients')
# Outputting the coefficients
print("\nLasso Regression Coefficients:")
print(lasso_coefficients)
print("\nRidge Regression Coefficients:")
print(ridge_coefficients)
# Plotting the coefficients
plt.figure(figsize=(12, 6))
lasso_coefficients.plot(kind='bar', color='blue', alpha=0.7, label='Lasso Regression')
ridge_coefficients.plot(kind='bar', color='orange', alpha=0.7, label='Ridge Regression')
plt.title('Lasso vs. Ridge Regression Coefficients')
plt.xlabel('Features')
plt.ylabel('Coefficients')
plt.legend()
plt.show()
Lasso Regression Cross-Validation Results:
Train MSE: [11.20666699 11.67411654 11.04421093 11.68885514 10.69664513]
Test MSE: [11.39210868  9.02284629 11.96660232 11.99680211 13.39725325]
Train MAE: [2.67132948 2.77871331 2.67748195 2.7707325  2.66335372]
Test MAE: [2.85383271 2.39444269 2.85633994 2.76982123 2.88783674]
Train MAPE: 0.03444431452354736
Test MAPE: 0.03493001884604956

Ridge Regression Cross-Validation Results:
Train MSE: [7.74448574 8.35934331 7.3459862  7.93124754 7.45480217]
Test MSE: [ 9.84239331  7.10438773 11.46098082  8.8855374  11.69348803]
Train MAE: [2.206153   2.27575906 2.09535439 2.2363997  2.16573858]
Test MAE: [2.43717243 2.09853272 2.76548542 2.31515811 2.69512577]
Train MAPE: 0.02773242061706115
Test MAPE: 0.031060612906844226

Lasso Regression Metrics on Test Set:
Test MSE: 13.25365719237693
Test MAE: 2.933882513923934
Test MAPE: 3.6301211308068986

Ridge Regression Metrics on Test Set:
Test MSE: 10.203421192219395
Test MAE: 2.56765965016351
Test MAPE: 3.1769900493774395

Lasso Regression Coefficients:
R               0.000000
AB              0.000000
H               0.000000
2B              0.000000
3B             -0.000000
HR              0.000000
BB              0.000000
SO             -0.000000
SB              0.000000
CS             -0.000000
HBP             0.000000
SF              0.000000
RA             -0.000000
ER             -0.000000
CG              0.000000
SHO             0.000000
SV              2.121406
HA             -0.000000
HRA            -0.000000
BBA            -0.000000
SOA             0.000000
E              -0.000000
DP             -0.000000
FP              0.000000
attendance      0.058465
BPF            -0.000000
PPF            -0.000000
differential    8.178797
AVG             0.000000
OBP             0.000000
1B              0.000000
SLG             0.000000
salary          0.000000
Name: Lasso Coefficients, dtype: float64

Ridge Regression Coefficients:
R               3.403701
AB             -0.366015
H              -0.056132
2B             -0.171711
3B             -0.474057
HR              0.078274
BB              0.389861
SO             -0.183055
SB              0.343619
CS             -0.123511
HBP             0.174036
SF             -0.157728
RA             -0.769355
ER             -1.867679
CG              0.390758
SHO             0.655094
SV              2.912473
HA              0.200086
HRA            -0.363844
BBA            -0.490196
SOA             0.633574
E               0.325810
DP             -0.017997
FP              0.701568
attendance      0.614488
BPF            -0.305166
PPF            -0.003891
differential    3.277630
AVG             0.598614
OBP             0.100328
1B              0.027911
SLG             0.617461
salary         -0.193715
Name: Ridge Coefficients, dtype: float64
A large block of code and an equally large output! Let's dive in and see what our model has discovered.
To recap, we used sklearn's ridge and lasso regression functions to build two models that relate a long list of independent variables from the Lahman baseball database to wins.
We shuffled the data to avoid it being grouped by year, split it into training and testing sets, and standardized the features so that each has a mean of 0 and a standard deviation of 1. We then used cross-validation to get a more accurate picture of each model's tendency to over- or underfit, and got the following results:
print("Lasso Regression Cross-Validation Results:")
print("Train MSE:", lasso_cv_results['train_mse'])
print("Test MSE:", lasso_cv_results['test_mse'])
print("Train MAE:", lasso_cv_results['train_mae'])
print("Test MAE:", lasso_cv_results['test_mae'])
print("Train MAPE:", -lasso_cv_results['train_mape'].mean())
print("Test MAPE:", -lasso_cv_results['test_mape'].mean())
print("\nRidge Regression Cross-Validation Results:")
print("Train MSE:", ridge_cv_results['train_mse'])
print("Test MSE:", ridge_cv_results['test_mse'])
print("Train MAE:", ridge_cv_results['train_mae'])
print("Test MAE:", ridge_cv_results['test_mae'])
print("Train MAPE:", -ridge_cv_results['train_mape'].mean())
print("Test MAPE:", -ridge_cv_results['test_mape'].mean())
Lasso Regression Cross-Validation Results:
Train MSE: [11.20666699 11.67411654 11.04421093 11.68885514 10.69664513]
Test MSE: [11.39210868  9.02284629 11.96660232 11.99680211 13.39725325]
Train MAE: [2.67132948 2.77871331 2.67748195 2.7707325  2.66335372]
Test MAE: [2.85383271 2.39444269 2.85633994 2.76982123 2.88783674]
Train MAPE: 0.03444431452354736
Test MAPE: 0.03493001884604956

Ridge Regression Cross-Validation Results:
Train MSE: [7.74448574 8.35934331 7.3459862  7.93124754 7.45480217]
Test MSE: [ 9.84239331  7.10438773 11.46098082  8.8855374  11.69348803]
Train MAE: [2.206153   2.27575906 2.09535439 2.2363997  2.16573858]
Test MAE: [2.43717243 2.09853272 2.76548542 2.31515811 2.69512577]
Train MAPE: 0.02773242061706115
Test MAPE: 0.031060612906844226
As we can see above, ridge appears to be more accurate than lasso in predicting the number of wins an MLB team has. Both are quite accurate, however, with test MAPE just above 3%. The models come within a few games of a team's actual win total, given data about its entire season.
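To see what a test MAPE of roughly 3% looks like in practice, we can plot the ridge model's predictions against the actual win totals in the test set. This is an optional sketch reusing y_test and y_pred_ridge from the modeling block above; the 55-105 reference range is simply a plausible span of win totals:
# Predicted vs. actual wins on the held-out test set for the ridge model
plt.figure(figsize=(6, 6))
plt.scatter(y_test, y_pred_ridge, s=20, alpha=0.7)
plt.plot([55, 105], [55, 105], color='green', linestyle='--', label='Perfect prediction')
plt.xlabel('Actual wins')
plt.ylabel('Predicted wins')
plt.title('Ridge Regression: Predicted vs. Actual Wins (Test Set)')
plt.legend()
plt.show()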
Moving on, we will take a look at the coefficients of each model. The lasso model, which is the slimmest, takes only a few variables into account: saves (2.121406), attendance (0.058465), and run differential (8.178797). Run differential is clearly the most important coefficient in terms of its correlation with a team's wins. The ridge regression model has coefficients for every variable, but its most significant are run differential (3.277630), runs (3.403701), earned runs allowed (-1.867679), and saves (2.912473). This falls in line with our lasso model, as runs and earned runs allowed are components of run differential, and saves matter because a save can only be recorded in a win.
The graph above displays the coefficients of every variable for both models, overlaid to compare their magnitudes. It highlights lasso's tendency to create slimmer models and leads to the same conclusions as the individual coefficients printed above.
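For a more compact view than the bar chart, the ridge coefficients can also be ranked by absolute magnitude, which surfaces the same handful of variables discussed above. A brief sketch using the ridge_coefficients series defined earlier:
# Ridge coefficients ranked by absolute size (features are standardized, so magnitudes are comparable)
ranked = ridge_coefficients.reindex(ridge_coefficients.abs().sort_values(ascending=False).index)
print(ranked.head(10))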
Overall, our model does a good job of predicting the number of wins a baseball team will have in a given year from a set of team statistics. We found the variables most important to a team's success, some of which are self-explanatory, such as run differential, and others more curious, such as park attendance.
These models perform well on historical data; however, it would be much harder to predict next year's win total from past years' data alone. That is an avenue we can pursue in future analysis of baseball data, and surely surprising conclusions would arise. Much of the information that would drive such predictions, such as prospects, momentum, and trades, is very hard to quantify, which is part of what makes baseball one of the most intriguing sports to analyze.
Thank you for reading this tutorial; below you will find the link to the Lahman website.