NBA Winning Estimator with Decision Tree in Python

It would be interesting to build a predictive model to understand the trends behind winning NBA teams.
We will use data from http://www.basketball-reference.com/leagues/NBA_2017_games-june.html and follow the standard data-mining workflow.

More details can be found in Robert Layton’s book here: https://www.goodreads.com/book/show/26019855-learning-data-mining-with-python?from_search=true

###1. Load data from http://www.basketball-reference.com/leagues/NBA_2017_games-june.html
import pandas as pd

csv_path = "NBA2017.csv"
# parse_dates converts the "Date" strings into datetime64 values on read.
NBA2017 = pd.read_csv(csv_path, sep=",", parse_dates=["Date"])
# Assign clean, fixed column names.
# BUG FIX: the original list placed "OT?" before "Score Type", but the
# recorded head() output showed the "OT?" column holding "Box Score" text and
# "Score Type" holding the OT flags — the two labels were swapped. Fixed here.
NBA2017.columns = ["Date", "Start (ET)",
	"Visitor Team", "VisitorPts", "Home Team", "HomePts", "Score Type", "OT?", "Notes"]
print(NBA2017.head())
#        Date Start (ET)       Visitor Team  VisitorPts  \
#0 2016-10-25    7:30 PM    New York Knicks          88
#1 2016-10-25   10:30 PM  San Antonio Spurs         129
#2 2016-10-25   10:00 PM          Utah Jazz         104
#3 2016-10-26    7:30 PM      Brooklyn Nets         117
#4 2016-10-26    7:00 PM   Dallas Mavericks         121
#
#                Home Team  HomePts Score Type   OT? Notes
#0     Cleveland Cavaliers      117  Box Score   NaN   NaN
#1   Golden State Warriors      100  Box Score   NaN   NaN
#2  Portland Trail Blazers      113  Box Score   NaN   NaN
#3          Boston Celtics      122  Box Score   NaN   NaN
#4          Indiana Pacers      130  Box Score    OT   NaN
print(NBA2017.dtypes)
#Date            datetime64[ns]
#Start (ET)              object
#Visitor Team            object
#VisitorPts               int64
#Home Team               object
#HomePts                  int64
#Score Type              object
#OT?                     object
#Notes                   object
#dtype: object

#2. Observe data
def rstr(df):
	"""Print a quick structural overview of *df*: its (rows, columns)
	shape followed by the unique values observed in every column."""
	uniques = df.apply(lambda col: [col.unique()])
	print('\nStructure of Data:\n', '\nRows X Columns: ', df.shape,
	      '\n\nFeatures and value:\n', uniques)
# Dump the structural summary of the loaded games table.
rstr(NBA2017)
#Structure of Data:
# 
#Rows X Columns:  (1306, 9) #
#Features and value:
# Date            [[2016-10-25 00:00:00, 2016-10-26 00:00:00, 20...
#Start (ET)      [[7:30 PM, 10:30 PM, 10:00 PM, 7:00 PM, 8:00 P...
#Visitor Team    [[New York Knicks, San Antonio Spurs, Utah Jaz...
#VisitorPts      [[88, 129, 104, 117, 121, 114, 98, 107, 108, 1...
#Home Team       [[Cleveland Cavaliers, Golden State Warriors, ...
#HomePts         [[117, 100, 113, 122, 130, 120, 102, 96, 97, 9...
#OT?                                                 [[Box Score]]
#Score Type                                  [[nan, OT, 2OT, 4OT]]
#Notes           [[nan, at London, England, at Mexico City, Mex...
#dtype: object

# Total count of NaN cells across the whole DataFrame
# (the unique-value dump above shows they sit mostly in Score Type / Notes).
print('NaN Summary:%i'%NBA2017.isnull().values.sum())
#NaN Summary:2536

#3.Preprocess, add necessary new features
# HomeWins is the prediction target: True when the home side outscored the visitor.
NBA2017["HomeWins"]=NBA2017["VisitorPts"]<NBA2017["HomePts"]
y_true = NBA2017["HomeWins"].values
# Baseline: fraction of games won by the home team.
print(NBA2017["HomeWins"].mean())
#0.58269525268
#Need to create a feature to record last game's result as a predictor
from collections import defaultdict
won_last = defaultdict(int)  # team name -> 1 if that team won its previous game
# BUG FIX: the column was originally initialised as "VistorLastWin" (typo)
# while the loop wrote "VisitorLastWin" via set_value, silently creating a
# second float column (both appeared in the recorded output). One correctly
# named int column is created instead.
NBA2017["HomeLastWin"] = 0
NBA2017["VisitorLastWin"] = 0

for index, row in NBA2017.iterrows():
    home_team = row["Home Team"]
    visitor_team = row["Visitor Team"]
    # NOTE: assigning into `row` only mutates an iteration copy, so the
    # original redundant row["HomeLastWin"] = ... statement was dropped.
    # DataFrame.set_value was removed in pandas 1.0; .at is the replacement.
    NBA2017.at[index, "HomeLastWin"] = won_last[home_team]
    NBA2017.at[index, "VisitorLastWin"] = won_last[visitor_team]
    # Update each team's "won last game" flag with this game's outcome.
    won_last[home_team] = int(row["HomeWins"])
    won_last[visitor_team] = 1 - int(row["HomeWins"])
# DataFrame.ix was removed; .loc does label-based slicing.
print(NBA2017.loc[1000:1005])
#           Date Start (ET)            Visitor Team  VisitorPts  \
#1000 2017-03-14    8:00 PM  Portland Trail Blazers          77   
#1001 2017-03-14    7:30 PM          Indiana Pacers          81   
#1002 2017-03-15    7:30 PM  Minnesota Timberwolves         104   
#1003 2017-03-15    8:00 PM       Memphis Grizzlies          98   
#1004 2017-03-15    7:30 PM               Utah Jazz          97   
#1005 2017-03-15    8:00 PM      Los Angeles Lakers         100   #

#                 Home Team  HomePts        OT? Score Type Notes HomeWins  \
#1000  New Orleans Pelicans      100  Box Score        NaN   NaN     True   
#1001       New York Knicks       87  Box Score        NaN   NaN     True   
#1002        Boston Celtics      117  Box Score        NaN   NaN     True   
#1003         Chicago Bulls       91  Box Score        NaN   NaN    False   
#1004       Detroit Pistons       83  Box Score        NaN   NaN    False   
#1005       Houston Rockets      139  Box Score        NaN   NaN     True   #

#      HomeLastWin  VistorLastWin  VisitorLastWin  
#1000            1              0             1.0  
#1001            0              0             1.0  
#1002            1              0             1.0  
#1003            1              0             1.0  
#1004            0              0             1.0  
#1005            1              0             0.0  

#4.Build the datasets (Split the data)
from sklearn.model_selection import train_test_split
# Feature matrix: did each side win its previous game?
# NOTE(review): "VisitorLastWin" is the column created inside the loop above
# via set_value (the explicitly initialised column is misspelled
# "VistorLastWin") — confirm which column is intended.
X_previouswins = NBA2017[["HomeLastWin", "VisitorLastWin"]].values
# Default 75%/25% train/test split; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X_previouswins, y_true, random_state=14)

#5.1 Fit the model of DecisionTreeClassifier
from sklearn.tree import DecisionTreeClassifier
clf=DecisionTreeClassifier(random_state=42, criterion="entropy")
clf.fit(X_train,y_train)

#5.2 Validate
import numpy as np 
from sklearn.model_selection import cross_val_score
# Cross-validated accuracy on the training portion only.
scores = cross_val_score(clf, X_train, y_train,scoring='accuracy')
print("Accuracy: {0:.1f}%".format(np.mean(scores) * 100))
#Accuracy: 59.0%

#5.3 Test
# Held-out accuracy: fraction of test games predicted correctly.
y_predicted = clf.predict(X_test)
accuracy = np.mean(y_test == y_predicted) * 100
print("The accuracy is {0:.1f}%".format(accuracy))
#The accuracy is 56.0%

#6.1 Fit the model of RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier
# FIX: max_features='auto' was removed in scikit-learn 1.3; 'sqrt' is the
# equivalent setting for classifiers. The duplicated cross-validation (it was
# run once here and again in 6.2, with repeated imports) is also removed —
# validation now happens once, in 6.2, mirroring the section-5 workflow.
rfc = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
	max_depth=5, max_features='sqrt', max_leaf_nodes=None, min_samples_leaf=1,
	min_samples_split=3, min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
	oob_score=False, random_state=42, verbose=0, warm_start=False)
rfc.fit(X_train,y_train)

#6.2 Validate
# Cross-validated accuracy on the training portion (cross_val_score clones
# the estimator, so the fit above is untouched).
scores = cross_val_score(rfc, X_train, y_train, scoring='accuracy')
print("Accuracy: {0:.1f}%".format(np.mean(scores) * 100))
#Accuracy: 59.0%

#6.3 Test
# Held-out accuracy of the fitted forest.
y_predicted = rfc.predict(X_test)
accuracy = np.mean(y_test == y_predicted) * 100
print("The accuracy is {0:.1f}%".format(accuracy))
#The accuracy is 56.0%

###Apparently, the data we provided above is not sufficient to make a more accurate prediction.
###We also need each team's position on the league ladder — its ranking, known as the "standing".
### http://www.basketball-reference.com/leagues/NBA_2015_standings.html
#1. load the standing data
import os
# os.path.join with a single component is a no-op; kept for symmetry with
# path-building elsewhere.
standings_filename = os.path.join("standings.csv")
standings = pd.read_csv(standings_filename)
# NOTE(review): the head() below shows team names and records as COLUMN
# labels (e.g. "1 Golden State Warriors"), which suggests the CSV's first
# data row was consumed as the header. The ranking join further down indexes
# standings["Team"] and standings["Rk"] — verify the file actually has that
# header row (or pass header=None/names=... to read_csv).
print(standings.head())
#   1 Golden State Warriors  67-15   39-2  28-13   25-5  42-10  1-Sep  3-Jul  \
#0  2         Atlanta Hawks  60-22   35-6  25-16  38-14   22-8  6-Dec   14-4   
#1  3       Houston Rockets  56-26  30-11  26-15   23-7  33-19  1-Sep  2-Aug   
#2  4  Los Angeles Clippers  56-26  30-11  26-15  19-11  37-15  3-Jul  4-Jun   
#3  5     Memphis Grizzlies  55-27  31-10  24-17  20-10  35-17  2-Aug  5-May   
#4  6     San Antonio Spurs  55-27   33-8  22-19   23-7  32-20  3-Jul  3-Jul   #

#  1-Sep.1   ...     25-6  3-May   45-9  Jan-00   13-2   3-Nov  3-Dec  3-Aug  \
#0   4-Dec   ...    17-11  4-Jun  30-10     0-1  5-Sep    14-2   17-0  4-Jul   
#1   4-Jun   ...     20-9  4-Aug  31-14  Feb-00  4-Nov   5-Sep  6-Nov  3-Jul   
#2   4-Jun   ...     21-7  5-Mar   33-9  Feb-00  5-Sep   6-Nov  4-Nov  6-May   
#3   3-Jul   ...    16-13  3-Sep  26-13  Feb-00   13-2   6-Aug  4-Dec  4-Jul   
#4   1-Sep   ...     21-8  7-Jun   34-7   1-Jan  3-Nov  10-Aug  4-Oct  5-Jun   #

#    16-2   2-Jun  
#0  7-Sep   3-Apr  
#1  6-Oct   2-Jun  
#2  5-Nov  Jul-00  
#3  8-Sep   3-Apr  
#4  3-Dec   1-Jul  

#2. Observe data(SKIPPED)

#3. Preprocess
# New feature: 1 when the home team sits higher in the standings than the
# visitor (a lower "Rk" value means a better rank).
# NOTE(review): assumes standings has "Team" and "Rk" columns — the head()
# printed above suggests the first data row became the header; confirm.
NBA2017["HomeTeamRanksHigher"] = 0
for index, row in NBA2017.iterrows():
    home_team = row["Home Team"]
    visitor_team = row["Visitor Team"]
    home_rank = standings[standings["Team"] == home_team]["Rk"].values[0]
    visitor_rank = standings[standings["Team"] == visitor_team]["Rk"].values[0]
    # DataFrame.set_value was removed in pandas 1.0; .at is the replacement.
    NBA2017.at[index, "HomeTeamRanksHigher"] = int(home_rank < visitor_rank)

X_homehigher = NBA2017[[ "HomeTeamRanksHigher", "HomeLastWin", "VisitorLastWin",]].values

#4.Build the datasets (Split the data)
# Same split seed as before so the decision-tree results stay comparable.
X_train, X_test, y_train, y_test = train_test_split(X_homehigher, y_true, random_state=14)

#5.1 Fit the model of DecisionTreeClassifier
standing_clf = DecisionTreeClassifier(random_state=46, criterion="entropy")
standing_clf.fit(X_train,y_train)

#5.2 Validate
import numpy as np 
from sklearn.model_selection import cross_val_score
# Cross-validated accuracy with the new ranking feature included.
scores = cross_val_score(standing_clf, X_train, y_train,scoring='accuracy')
print("Stand_accuracy: {0:.1f}%".format(np.mean(scores) * 100))
#Stand_accuracy: 60.9%


#5.3 Test
# Held-out accuracy — slightly better than the previous-wins-only model.
y_predicted =standing_clf.predict(X_test)
accuracy = np.mean(y_test == y_predicted) * 100
print("The accuracy is {0:.1f}%".format(accuracy))
#The accuracy is 61.2%

###Now we walked through the whole process at the beginning
# Head-to-head feature: did the home team win the last meeting of this pair?
last_match_winner = defaultdict(int)  # sorted (teamA, teamB) tuple -> last winner's name
NBA2017["HomeTeamWonLast"] = 0

for index, row in NBA2017.iterrows():
    home_team = row["Home Team"]
    visitor_team = row["Visitor Team"]
    teams = tuple(sorted([home_team, visitor_team]))  # Sort for a consistent ordering
    # Set in the row, who won the last encounter
    home_team_won_last = 1 if last_match_winner[teams] == row["Home Team"] else 0
    # DataFrame.set_value was removed in pandas 1.0; .at is the replacement.
    NBA2017.at[index, "HomeTeamWonLast"] = home_team_won_last
    winner = row["Home Team"] if row["HomeWins"] else row["Visitor Team"]
    last_match_winner[teams] = winner
# DataFrame.ix was removed; .loc does label-based slicing.
print(NBA2017.loc[400:405])
#SKIPPED

X_lastwinner = NBA2017[[ "HomeTeamWonLast", "HomeTeamRanksHigher", "HomeLastWin",
"VisitorLastWin",]].values
X_train, X_test, y_train, y_test = train_test_split(X_lastwinner, y_true, random_state=14)
Last_clf = DecisionTreeClassifier(random_state=24, criterion="entropy")
# cross_val_score clones the estimator, so scoring before fit() is fine.
scores = cross_val_score(Last_clf, X_train, y_train, scoring='accuracy')
print("Last_Accuracy: {0:.1f}%".format(np.mean(scores) * 100))
#Last_Accuracy: 60.7%

Last_clf.fit(X_train,y_train)
y_predicted =Last_clf.predict(X_test)
accuracy = np.mean(y_test == y_predicted) * 100
print("The accuracy is {0:.1f}%".format(accuracy))
#The accuracy is 59.6%

###If we want to consider the effect of the team names, we can use 
###from sklearn.preprocessing import LabelEncoder###
#&
###from sklearn.preprocessing import OneHotEncoder###
#to convert team names to interactive features, we will NOT do it here

# Random forest over all four engineered features, default hyper-parameters.
Last_rfc = RandomForestClassifier(random_state=14)
# Cross-validated accuracy on the training portion.
scores = cross_val_score(Last_rfc, X_train, y_train, scoring='accuracy')
print("Accuracy: {0:.1f}%".format(np.mean(scores) * 100))
#Accuracy: 60.0%

# Held-out accuracy of the fitted forest.
Last_rfc.fit(X_train,y_train)
y_predicted =Last_rfc.predict(X_test)
accuracy = np.mean(y_test == y_predicted) * 100
print("The accuracy is {0:.1f}%".format(accuracy))
#The accuracy is 59.6%
Advertisements

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out / Change )

Twitter picture

You are commenting using your Twitter account. Log Out / Change )

Facebook photo

You are commenting using your Facebook account. Log Out / Change )

Google+ photo

You are commenting using your Google+ account. Log Out / Change )

Connecting to %s