In [54]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
In [23]:
# Candidate classifiers for the shot-prediction task, keyed by short name.
# BUG FIX: key was misspelled 'naive_baives'.
models = {
    'svm': LinearSVC(),
    'log_reg': LogisticRegression(),
    'naive_bayes': MultinomialNB(),
    'knn': KNeighborsClassifier(),
    'dec_tree': DecisionTreeClassifier(),
}
Read in the Kobe Bryant shooting data ([Kaggle: Kobe Bryant Shot Selection](https://www.kaggle.com/c/kobe-bryant-shot-selection)).
In [87]:
# Load the shot log and drop rows with any missing value.
# NOTE: this also removes rows where `shot_made_flag` is NaN (the unlabeled
# shots), which is what we want for supervised training below.
# Chained .dropna() instead of inplace=True keeps the cell idempotent.
kobe = pd.read_csv('../data/kobe.csv').dropna()
For now, use just the numerical datatypes. They are listed below as `num_columns`.
In [39]:
# Preview the first few rows instead of dumping the entire DataFrame.
kobe.head()
Out[39]:
In [ ]:
In [52]:
# Numeric feature candidates: every column whose dtype is not `object`.
# (Replaces a dead duplicate list-comprehension with idiomatic select_dtypes.)
num_columns = kobe.select_dtypes(exclude='object').columns.tolist()
num_columns
Out[52]:
In [ ]:
#kobe.hist()
In [77]:
# Shot locations, colored by outcome: blue = missed, green = made.
fig, ax = plt.subplots()
kobe[kobe.shot_made_flag == 0].plot(kind='scatter', x='loc_x', y='loc_y',
                                    color='blue', alpha=0.1, ax=ax, label='missed')
kobe[kobe.shot_made_flag == 1].plot(kind='scatter', x='loc_x', y='loc_y',
                                    color='green', alpha=0.1, ax=ax, label='made')
ax.set(title='Kobe shot locations by outcome', xlabel='loc_x', ylabel='loc_y')
ax.legend();
Out[77]:
In [68]:
# Shot-distance distributions for misses vs. makes, on shared 2-ft bins.
bins = range(0, 70, 2)
kobe[kobe.shot_made_flag == 0].shot_distance.hist(bins=bins, alpha=0.4, label='missed')
kobe[kobe.shot_made_flag == 1].shot_distance.hist(bins=bins, alpha=0.4, label='made')
plt.xlabel('shot_distance (ft)')
plt.ylabel('count')
plt.legend();
Out[68]:
Use `num_columns` and the `kobe` dataframe to `fit()` the models. Choose one or more of the entries in `num_columns` as features. These models are used to predict whether Kobe will make or miss a shot given the input parameters provided.
In [27]:
# NOTE(review): these imports are used by earlier cells (e.g. In[77]/In[68]),
# so on a fresh "Restart & Run All" those cells fail with NameError for `plt`.
# Move the imports to the top import cell; keep only the config here.
import seaborn as sns
import matplotlib.pyplot as plt
# Render figures inline in the notebook.
%matplotlib inline
# Larger fonts for readability in rendered figures.
sns.set(font_scale=1.5)
In [94]:
# Fit a classifier on numeric features and evaluate its accuracy.
feature_cols = ['shot_distance', 'minutes_remaining']
X = kobe[feature_cols]
y = kobe.shot_made_flag

# Swap in a different estimator (see the `models` dict above) to compare.
from sklearn.linear_model import LogisticRegression as Model

model = Model()
model.fit(X, y)
kobe['pred'] = model.predict(X)

# Predicted vs. actual outcome as a function of shot distance.
plt.scatter(kobe.shot_distance, kobe.shot_made_flag, label='actual')
plt.scatter(kobe.shot_distance, kobe.pred, color='red', alpha=.2, label='predicted')
plt.xlabel('dist')
plt.ylabel('made')
plt.legend()

from sklearn.metrics import accuracy_score
# NOTE: this is *training* accuracy (no train/test split), so it is optimistic.
# predict() already returns class labels (0/1); the old .round() was a leftover
# from the LinearRegression variant and is unnecessary here.
accuracy_score(kobe.shot_made_flag, kobe.pred)
Out[94]:
In [45]:
# fit a linear regression model and store the predictions
example = pd.DataFrame({'a':[1,2,3,4,5,6], 'b':[1,1,0,0,0,1]})
feature_cols = ['a']
X = example[feature_cols]
y = example.b
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(X, y)
example['pred'] = model.predict(X)
# scatter plot that includes the regression line
plt.scatter(example.a, example.b)
plt.plot(example.a, example.pred, color='red')
plt.xlabel('a')
plt.ylabel('b')
from sklearn.metrics import accuracy_score
accuracy_score(example.b, example.pred.astype(int))
Out[45]:
In [ ]: