import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.neighbors import KNeighborsRegressor
import statsmodels.api as sm
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Flatten
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
import tensorflow_docs as tfdocs
import tensorflow_docs.plots
import tensorflow_docs.modeling
import warnings
warnings.filterwarnings('ignore')
warnings.filterwarnings('ignore', category=DeprecationWarning)
from tensorflow.keras.utils import to_categorical
from statsmodels.stats.outliers_influence import variance_inflation_factor
# Displaying df with all columns available
pd.set_option("display.max_columns",None)
# Reading the CSV file and storing it in a dataframe
data_df = pd.read_csv("../../Data/Team_Stats/2000-2019_next_year_points.csv")
print(data_df.shape)
# Displaying dataframe
data_df.head()
# Finding columns with null values
data_df.isnull().sum()
# Dropping rows that contain any null values
no_null_df = data_df.dropna(how="any")
print(no_null_df.shape)
no_null_df.head()
# Storing next year's offensive points in the targets variable
targets = no_null_df.loc[:,"NY PF_Off"].values
# Displaying the targets
targets[:10]
# Visualizing the distribution of the target data
plt.figure(figsize=(15,10))
sns.histplot(targets, kde=True)
plt.xlabel("Offensive Points", labelpad=14)
plt.ylabel("Frequency of occurrence", labelpad=14)
plt.title("Distribution of Next-year Offensive Points", y=1.015, fontsize=20)
# Storing the inputs
inputs = no_null_df.drop(columns=["NY PF_Off","NY PF_Def","Games","Year","Team"])
print(inputs.shape)
# Storing the input column names
feature_names = inputs.columns
feature_names
# Using a forward selection algorithm to determine significant features
def forward_selection(data, target, significance_level=0.1):
    data = pd.DataFrame(data, columns=feature_names)
    target = pd.DataFrame(target)
    initial_features = data.columns.tolist()
    best_features = []
    while (len(initial_features) > 0):
        remaining_features = list(set(initial_features) - set(best_features))
        new_pval = pd.Series(index=remaining_features)
        for new_column in remaining_features:
            model = sm.OLS(target, sm.add_constant(data[best_features + [new_column]])).fit()
            new_pval[new_column] = model.pvalues[new_column]
        min_p_value = new_pval.min()
        if (min_p_value < significance_level):
            best_features.append(new_pval.idxmin())
        else:
            break
    return best_features
# Storing forward selection significant features and displaying them
forward_selection_list = forward_selection(inputs, targets)
forward_selection_list
# Using a backward elimination algorithm to determine significant features
def backward_elimination(data, target, significance_level=0.1):
    data = pd.DataFrame(data, columns=feature_names)
    features = data.columns.tolist()
    while (len(features) > 0):
        features_with_constant = sm.add_constant(data[features])
        p_values = sm.OLS(target, features_with_constant).fit().pvalues[1:]
        max_p_value = p_values.max()
        if (max_p_value >= significance_level):
            excluded_feature = p_values.idxmax()
            features.remove(excluded_feature)
        else:
            break
    return features
# Storing backward elimination significant features and displaying them
backward_elimination_list = backward_elimination(inputs, targets)
backward_elimination_list
# Using a stepwise selection algorithm to determine significant features
def stepwise_selection(data, target, SL_in=0.05, SL_out=0.1):
    data = pd.DataFrame(data, columns=feature_names)
    initial_features = data.columns.tolist()
    best_features = []
    while (len(initial_features) > 0):
        remaining_features = list(set(initial_features) - set(best_features))
        new_pval = pd.Series(index=remaining_features)
        for new_column in remaining_features:
            model = sm.OLS(target, sm.add_constant(data[best_features + [new_column]])).fit()
            new_pval[new_column] = model.pvalues[new_column]
        min_p_value = new_pval.min()
        if (min_p_value < SL_in):
            best_features.append(new_pval.idxmin())
            while (len(best_features) > 0):
                best_features_with_constant = sm.add_constant(data[best_features])
                p_values = sm.OLS(target, best_features_with_constant).fit().pvalues[1:]
                max_p_value = p_values.max()
                if (max_p_value >= SL_out):
                    excluded_feature = p_values.idxmax()
                    best_features.remove(excluded_feature)
                else:
                    break
        else:
            break
    return best_features
# Storing stepwise selection significant features and displaying them
stepwise_selection_list = stepwise_selection(inputs, targets)
stepwise_selection_list
# Creating significant feature list
feature_list = []
# Adding features that are significant in all three wrapper methods to the feature list
for feature in stepwise_selection_list:
    if feature in backward_elimination_list and feature in forward_selection_list:
        feature_list.append(feature)
# Displaying length of feature list
print(len(feature_list))
# Displaying the feature list
print(feature_list)
# Displaying the inputs
inputs.head()
# Displaying the input distributions as histograms
inputs.hist(figsize=(15, 60), color="purple", grid=False, sharex=False, sharey=False, layout=(22,4))
plt.show()
# Creating the sc object from the StandardScaler class
sc = StandardScaler()
# Fitting and transforming the inputs
scaled_inputs = sc.fit_transform(inputs)
# Splitting the dataset into training and testing
x_train, x_test, y_train, y_test = train_test_split(scaled_inputs, targets, random_state=0)
# For each X, calculate VIF (Variance Inflation Factor) and save in dataframe
# VIF is a measure used to determine multicollinearity
vif = pd.DataFrame()
vif["VIF Factor"] = [variance_inflation_factor(x_train, i) for i in range(x_train.shape[1])]
vif["features"] = inputs.columns
# Sorting VIF in descending order
vif.sort_values("VIF Factor",ascending=False).head()
# Storing rows with VIF over 11
inf = vif.loc[vif["VIF Factor"] > 11]
# Storing the feature names that have a VIF over 11
inf_list = list(inf['features'])
# Dropping high VIF factors
inputs = inputs.drop(columns=inf_list)
inputs
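# NOTE: dropping the high-VIF columns above only changes `inputs`; the scaled
# x_train / x_test created earlier still contain every original column, so the
# models below are fit on the full feature set. If the reduced set is intended,
# a minimal sketch (assumption, not part of the original flow) would re-fit the
# scaler and re-split:
# sc = StandardScaler()
# scaled_inputs = sc.fit_transform(inputs)
# x_train, x_test, y_train, y_test = train_test_split(scaled_inputs, targets, random_state=0)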
# Displaying correlation matrix of the inputs
corr_df = inputs.corr()
sns.heatmap(corr_df)
# Creating regressor object from Linear Regression Class
regressor = LinearRegression()
# Fitting training data on the regressor object
regressor.fit(x_train, y_train)
# Storing the r2 score on the test data
r2 = regressor.score(x_test, y_test)
# Storing predicted values from the testing data set
y_pred = regressor.predict(x_test)
# Storing the RMSE of the test set
rmse = mean_squared_error(y_test, y_pred) ** 0.5
# Displaying the r2 score and RMSE
print("R2:", r2, "RMSE:", rmse)
# Storing regressor's prediction on the test data
y_pred = regressor.predict(x_test)
# Creating a dataframe of the prediction, actual, and residuals
# (the x16 presumably scales per-game values up to a full 16-game season)
residual_df = pd.DataFrame({"Prediction": y_pred * 16,
                            "Actual": y_test * 16,
                            "Residual": (y_test - y_pred) * 16})
# Displaying residual dataframe
residual_df.head()
# Average residual for linear regression
residual_df["Residual"].mean()
# Creating a lasso object from the Lasso Class
lasso = Lasso(random_state=1)
# Parameter grid for grid search
param_grid = {'alpha':[0.001, 0.01, 0.1, 1, 10, 100, 1000],
'max_iter':[10,100,1000,5000]}
# Wrapping lasso model in gridsearch
grid = GridSearchCV(lasso, param_grid = param_grid, scoring='r2', verbose=1, n_jobs=-1)
# Fitting the grid search lasso model with the training data
grid_result = grid.fit(x_train, y_train)
# Printing the best score and parameter
print('Best Score: ', grid_result.best_score_)
print('Best Params: ', grid_result.best_params_)
# Creating a lasso object from the Lasso Class
lasso = Lasso(random_state=1, alpha=.1, max_iter=10)
# Fitting the training data to the regressor object
lasso.fit(x_train, y_train)
# Displaying the score of the model
lasso.score(x_test, y_test)
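# Lasso shrinks uninformative coefficients to exactly zero, so listing the
# largest surviving coefficients shows which inputs the model kept (sketch;
# feature_names matches the columns of x_train, which was built from the full
# input set before the VIF drop).
lasso_coefs = pd.Series(lasso.coef_, index=feature_names)
lasso_coefs.reindex(lasso_coefs.abs().sort_values(ascending=False).index).head(10)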
# Creating ridge model object from Ridge Class
ridge = Ridge(random_state=1)
# Parameter grid for grid search
param_grid = {'alpha':[0.001, 0.01, 0.1, 1, 10, 100, 1000]}
# Wrapping ridge model in gridsearch
grid = GridSearchCV(ridge, param_grid = param_grid, scoring='r2', verbose=1, n_jobs=-1)
# Fitting the grid search ridge model with the training data
grid_result = grid.fit(x_train, y_train)
# Printing the best score and parameter
print('Best Score: ', grid_result.best_score_)
print('Best Params: ', grid_result.best_params_)
# Creating a ridge object from the Ridge Class
ridge = Ridge(random_state=1, alpha=1000)
# Fitting the training data to the ridge object
ridge.fit(x_train, y_train)
# Displaying the score of the model
ridge.score(x_test, y_test)
# Creating an elasticnet object from the ElasticNet Class
elasticnet = ElasticNet(random_state=1)
# Parameter grid for grid search
param_grid = {'alpha':[0.001, 0.01, 0.1, 1, 10, 100, 1000],
'l1_ratio':[0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]}
# Wrapping elastic net model in gridsearch
grid = GridSearchCV(elasticnet, param_grid=param_grid, scoring='r2', verbose=1, n_jobs=-1)
# Fitting the grid search elastic net model with the training data
grid_result = grid.fit(x_train, y_train)
# Printing the best score and parameter
print('Best Score: ', grid_result.best_score_)
print('Best Params: ', grid_result.best_params_)
# Creating an elastic object from the ElasticNet Class
elastic = ElasticNet(random_state=1, alpha=1, l1_ratio=.9)
# Fitting the training data to the elasticnet object
elastic.fit(x_train, y_train)
# Displaying the score of the model
elastic.score(x_test, y_test)
# Creating a regressor object from the Random Forest Regressor
regressor = RandomForestRegressor(n_estimators=1000, random_state=0)
# Fitting the training data to the regressor object
regressor.fit(sc.inverse_transform(x_train), y_train)
# Storing the Random Forest Regressor feature importances in the importance variable
importance = sorted(zip(regressor.feature_importances_, feature_names), reverse=True)
# Displaying the importance
importance[:10]
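# Visualizing the top Random Forest importances as a horizontal bar chart
# (sketch; reuses the sorted `importance` list built above).
top_importance = importance[:10]
plt.figure(figsize=(10, 6))
plt.barh([name for _, name in top_importance], [score for score, _ in top_importance], color="purple")
plt.gca().invert_yaxis()
plt.xlabel("Feature importance")
plt.title("Top 10 Random Forest Feature Importances")
plt.show()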
# Displaying the r2 score of the random forest regressor
regressor.score(sc.inverse_transform(x_test), y_test)
# Storing regressor's prediction on the unscaled test data (the model was fit on unscaled inputs)
y_pred = regressor.predict(sc.inverse_transform(x_test))
# Creating a dataframe of the prediction, actual and residuals
residual_df = pd.DataFrame({"Prediction": y_pred * 16,
"Actual":y_test * 16,
"Residual": (y_test - y_pred) * 16})
# Displaying residual dataframe
residual_df.head()
# Average residual for Random Forest Regressor
residual_df["Residual"].mean()
# Creating a regressor object from the SVR Class with an RBF kernel
regressor = SVR(kernel='rbf')
# Fitting the training data to the regressor object
regressor.fit(x_train, y_train)
# Displaying the r2 score on the test data
regressor.score(x_test, y_test)
# Storing regressor's prediction on the test data
y_pred = regressor.predict(x_test)
# Creating a dataframe of the prediction, actual and residuals
residual_df = pd.DataFrame({"Prediction": y_pred * 16,
"Actual":y_test * 16,
"Residual": (y_test - y_pred) * 16})
# Displaying residual dataframe
residual_df.head()
# Average residual for the support vector regressor
residual_df["Residual"].mean()
# Creating a polyReg object from the PolynomialFeatures class
polyReg = PolynomialFeatures(degree = 3)
# Fitting and transforming the training inputs into degree-3 polynomial features
polyInputs = polyReg.fit_transform(x_train)
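# The degree-3 expansion inflates the feature count sharply, which is worth a
# quick check before fitting (sketch).
print("Original features:", x_train.shape[1], "-> polynomial features:", polyInputs.shape[1])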
# Creating a regressor object from the LinearRegression Class
regressor = LinearRegression()
# Fitting the regressor with the transformed input data and the target training data
regressor.fit(polyInputs,y_train)
# Displaying the score of the polynomial regressor
regressor.score(polyReg.transform(x_test),y_test)
# Storing regressor's prediction on the test data
y_pred = regressor.predict(polyReg.transform(x_test))
# Creating a dataframe of the prediction, actual and residuals
residual_df = pd.DataFrame({"Prediction": y_pred * 16,
"Actual":y_test * 16,
"Residual": (y_test - y_pred) * 16})
# Displaying residual dataframe
residual_df.head()
# Average residual for polynomial regression
residual_df["Residual"].mean()
# Creating a knr object from the KNeighborsRegressor Class
knr = KNeighborsRegressor()
# Parameter grid for grid search
param_grid = {'leaf_size': list(range(1,50,5)),
'n_neighbors' : list(range(1,30,3)),
'p' : [1,2]}
# Wrapping knr model in gridsearch
grid = GridSearchCV(knr, param_grid=param_grid, scoring='r2', verbose=1, n_jobs=-1)
# Fitting the grid search knr model with the training data
grid_result = grid.fit(x_train, y_train.reshape(-1,))
# Printing the best score and parameter
print('Best Score: ', grid_result.best_score_)
print('Best Params: ', grid_result.best_params_)
# Creating the knr object from the KNeighborsRegressor Class
knr = KNeighborsRegressor(leaf_size=1, n_neighbors=28, p=2)
# Fitting the model on the training data
knr.fit(x_train, y_train)
# Scoring the test dataset
knr.score(x_test, y_test)
# parameters = {
# "n_estimators":[5,50,250,500],
# "max_depth":[1,3,5,7,9],
# "learning_rate":[0.01,0.1,1,100]
# }
# rf = GradientBoostingRegressor()
# grid = GridSearchCV(rf, parameters,
# cv = 3, n_jobs = -1, verbose = 1)
# grid_result = grid.fit(x_train, y_train)
# # Printing the best score and parameter
# print('Best Score: ', grid_result.best_score_)
# print('Best Params: ', grid_result.best_params_)
# Creating a gbr object from the GradientBoostingRegressor Class
gbr = GradientBoostingRegressor(random_state=0)
# Fitting the model with unscaled data
gbr.fit(sc.inverse_transform(x_train), y_train)
# Displaying the score on the test data set
gbr.score(sc.inverse_transform(x_test), y_test)
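# The gradient boosting model records its training loss at every boosting stage
# in train_score_; plotting it gives a quick view of convergence (sketch).
plt.figure(figsize=(8, 5))
plt.plot(gbr.train_score_)
plt.xlabel("Boosting iteration")
plt.ylabel("Training loss")
plt.title("Gradient Boosting Training Loss by Stage")
plt.show()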
# Configuring the optimizer and loss function for the neural network
optimizer = Adam(learning_rate=0.01)
loss = 'mse'
# Building a sequential feed-forward network
NN_model = Sequential()
# The Input Layer :
NN_model.add(Dense(128, kernel_initializer='normal',input_dim = x_train.shape[1], activation='relu'))
# The Hidden Layers :
NN_model.add(Dense(100, kernel_initializer='normal',activation='relu'))
NN_model.add(Dense(100, kernel_initializer='normal',activation='relu'))
NN_model.add(Dense(100, kernel_initializer='normal',activation='relu'))
# The Output Layer :
NN_model.add(Dense(1, kernel_initializer='normal',activation='linear'))
# Compile the network :
NN_model.compile(loss=loss, optimizer=optimizer, metrics=['mse','mae'])
NN_model.summary()
# Training configuration: epochs, batch size, and early stopping on validation loss
epochs = 250
batch_size = 10
callback = EarlyStopping(monitor="val_loss", patience=5)
history = NN_model.fit(x_train,y_train,
epochs=epochs,
batch_size=batch_size,
validation_split=0.3,
callbacks=[callback])
# Evaluating the network on the test set
test_loss, test_mse, test_mae = NN_model.evaluate(x_test, y_test)
# Storing the training history in a dataframe
hist = pd.DataFrame(history.history)
hist['epoch'] = history.epoch
hist.tail()
# Plotting the training history with tensorflow_docs
plotter = tfdocs.plots.HistoryPlotter(smoothing_std=0)
plotter.plot({'Basic': history}, metric="mse")
plt.ylim([0, 20])
plt.ylabel('MSE [Points]')
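# Building the same prediction / actual / residual dataframe for the network as
# for the other models, so its average residual is directly comparable (sketch).
y_pred = NN_model.predict(x_test).flatten()
residual_df = pd.DataFrame({"Prediction": y_pred * 16,
                            "Actual": y_test * 16,
                            "Residual": (y_test - y_pred) * 16})
residual_df["Residual"].mean()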
# Storing subplots in fig and ax variables
fig, ax = plt.subplots(figsize=(15,12))
# Creating a seaborn heatmap with inputs df
sns.heatmap(inputs.corr(), center=0, cmap="Blues")#annot=True
# Setting the title
ax.set_title("Multi-Collinearity of Football Stats")