In [69]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
%matplotlib inline

In [227]:
kenPomDataWithIds = pd.read_csv('data/kenPomTeamData.csv')
kenPomDataWithIds = kenPomDataWithIds[kenPomDataWithIds['Season'] > 2012]
kenPomDataWithIds = kenPomDataWithIds[ [ col for col in kenPomDataWithIds if 'Rank' not in col ] ]

In [228]:
idCols = ['Season', 'Unnamed: 0', 'TeamName', 'Team_Id']
kenPomData = kenPomDataWithIds[ [ col for col in kenPomDataWithIds if col not in idCols ] ]

In [229]:
# remove NaN
# not sure if I have to do this after normalizing hgt data column names

colsWithNaN = [ col for col in kenPomData if kenPomData[col].isnull().any() ]
kenPomData = kenPomData[ [ col for col in kenPomData if col not in colsWithNaN ] ]

In [231]:
# take a look at column stats

kenPomData.describe().T.sort_values(by = 'std')[::-1]


Out[231]:
count mean std min 25% 50% 75% max
AdjEM 1751.0 0.000126 11.928057 -46.110000 -8.390000 -0.830000 7.934670 36.900000
AdjOE 1751.0 102.885645 7.360252 74.270000 97.940650 102.710000 107.707000 126.950000
ORC 1751.0 34.933432 6.774318 0.010000 30.510000 34.510000 39.225000 60.310000
OE 1751.0 102.501754 6.682151 75.880000 98.050000 102.680000 106.820000 122.857000
AdjDE 1751.0 102.885517 6.653718 82.460000 98.415000 103.170000 107.529000 121.964000
ORPF 1751.0 28.078835 6.544270 0.000000 23.890000 27.940000 32.160000 59.120000
defFTRate 1751.0 37.389800 6.466229 19.963201 32.851592 37.000700 41.485660 60.742300
ORSF 1751.0 17.619783 6.016728 0.000000 13.330000 17.100000 21.380000 99.980000
DE 1751.0 102.975450 5.854790 84.310000 98.975000 103.250000 106.965000 122.345000
ARate 1751.0 52.583720 5.517699 35.182500 48.752339 52.327200 56.262250 71.258700
Bench 1751.0 31.595694 5.480651 13.670000 27.885000 31.680000 35.360000 49.540000
offFTRate 1751.0 37.099190 5.244431 21.414538 33.462413 36.941410 40.567030 58.585900
OppARate 1751.0 52.786651 5.043708 39.277108 49.326650 52.600300 56.028250 72.266200
DRC 1751.0 26.364289 4.968006 0.010000 23.420000 26.150000 29.080000 99.960000
DRPG 1751.0 14.057801 4.490671 0.010000 12.085000 13.750000 15.625000 99.970000
ORSG 1751.0 11.170023 4.475188 0.000000 8.065000 10.610000 13.580000 43.230000
DRPF 1751.0 24.246933 4.412403 0.010000 21.740000 24.090000 26.580000 99.960000
PtsPG 1751.0 19.356208 4.319474 8.620000 16.295000 18.980000 22.095000 33.640000
offORPct 1751.0 30.582126 4.128940 13.426854 27.908040 30.604650 33.407500 43.826100
PtsC 1751.0 19.262701 4.031621 8.460000 16.585000 19.050000 21.855000 34.000000
ORPG 1751.0 8.180240 3.953338 0.000000 5.850000 7.680000 9.960000 99.980000
FTPct 1751.0 69.600811 3.769245 56.504900 66.981850 69.577500 72.289697 82.828283
DRSG 1751.0 16.081462 3.662340 0.010000 13.905000 15.890000 17.930000 80.800000
PtsSG 1751.0 20.621953 3.661595 9.010000 18.095000 20.590000 22.985000 34.570000
DRSF 1751.0 19.234752 3.619687 0.010000 16.980000 19.080000 21.400000 52.900000
PtsSF 1751.0 20.424472 3.583453 7.790000 18.055000 20.290000 22.730000 33.410000
Tempo 1751.0 67.346562 3.448001 57.060000 64.930000 67.280000 69.624850 83.759900
OppFG2Pct 1751.0 48.441529 3.446107 37.641357 46.012219 48.422800 50.884100 59.974425
PtsPF 1751.0 20.322953 3.424899 9.640000 17.995000 20.190000 22.545000 33.200000
FG2Pct 1751.0 48.268598 3.376487 37.606838 45.969550 48.273200 50.428636 62.556100
AdjTempo 1751.0 67.175375 3.241102 57.780000 64.910000 67.095700 69.211450 83.446800
offeFGPct 1751.0 49.384335 3.218561 39.227600 47.252143 49.331400 51.551950 61.689910
defORPct 1751.0 30.780149 3.077144 20.871143 28.694738 30.833333 32.787500 41.048825
defeFGPct 1751.0 49.571613 3.056477 39.557855 47.394897 49.607700 51.608600 59.360731
FG3Pct 1751.0 34.291679 2.964160 24.870500 32.251700 34.242400 36.318150 44.117600
BlockPct 1751.0 9.342820 2.786187 2.927200 7.234900 9.074410 11.166150 20.190779
OppFG3Pct 1751.0 34.493254 2.596979 27.089800 32.699250 34.428600 36.223205 43.795620
OppFTPct 1751.0 69.672968 2.498298 59.176030 68.018700 69.776600 71.340800 77.611940
defTOPct 1751.0 18.870992 2.292262 10.214000 17.291300 18.786500 20.276750 32.708337
offTOPct 1751.0 18.939534 2.157613 11.900000 17.475950 18.802200 20.277435 27.183900
HgtEff 1751.0 -0.000126 1.920951 -7.310000 -1.220000 -0.080000 1.195000 9.330000
OppBlockPct 1751.0 9.500723 1.723363 4.037685 8.291300 9.396914 10.576144 18.478261
HgtPG 1751.0 -0.000074 1.616856 -6.290000 -1.060000 0.080000 1.160000 4.760000
HgtC 1751.0 0.000103 1.252458 -3.560000 -0.910000 -0.060000 0.730000 6.960000
HgtSG 1751.0 -0.000029 1.244951 -5.560000 -0.815000 0.010000 0.865000 4.140000
HgtSF 1751.0 0.000194 1.135412 -4.370000 -0.770000 -0.050000 0.850000 4.110000
HgtPF 1751.0 0.000051 1.085649 -3.780000 -0.730000 -0.030000 0.790000 4.340000
Height 1751.0 76.704980 0.912441 73.430000 76.080000 76.690000 77.350000 79.400000
Exp 1751.0 1.698744 0.398968 0.280000 1.430000 1.710000 1.990000 2.850000
StlRate 1751.0 0.091188 0.016980 0.042100 0.079684 0.090000 0.101700 0.170273
OppStlRate 1751.0 0.091697 0.013406 0.048849 0.082607 0.090502 0.099998 0.155200

In [233]:
# take a look at correlations

corrmat = kenPomData.corr()
f, ax = plt.subplots(figsize = (12, 9))
sns.heatmap(corrmat, square=True)
plt.show()



In [234]:
kpPca =  PCA()
kpPca.fit(kenPomData.values)


Out[234]:
PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [245]:
sns.plt.plot(kpPca.explained_variance_ratio_.cumsum())
sns.plt.show()



In [237]:
# take a look at the components

n_components = 10
f, allAx = plt.subplots(n_components, 1, figsize=(15, 4 * n_components))
f.subplots_adjust(top = 1.3)
for i in range(len(allAx)):
    b = sns.barplot(x = kenPomData.columns, y = np.abs(kpPca.components_[i]), ax = allAx[i])
    b.set_xticklabels(kenPomData.columns, rotation = 90)
plt.show()



In [244]:
# which components contribute more than 0.3 to a component in the first 95% of the variance?

importanceThresh = 0.5**2
kpCumComponentVar = kpPca.components_.cumsum()
idxNinetyFive = next(i for i in range(len(kpCumComponentVar)) if kpCumComponentVar[i] > 0.95)
importantPCAVariables = set()
for component in kpPca.components_[:idxNinetyFive]:
    componentImportantVariables = set([ i for i in range(len(component)) if component[i]**2 > importanceThresh ])
    importantPCAVariables = importantPCAVariables.union(componentImportantVariables)
importantPCAVariables = list(map(lambda idx: kenPomData.columns[idx], importantPCAVariables))
print(int(100*len(importantPCAVariables) / len(kenPomData.columns) + 0.5), '% of features seem important', sep = '')
print('meaningful variables:\n', importantPCAVariables)


65% of features seem important
meaningful variables:
 ['ARate', 'AdjDE', 'AdjEM', 'AdjOE', 'AdjTempo', 'BlockPct', 'DRPG', 'Exp', 'FG3Pct', 'FTPct', 'Height', 'HgtC', 'HgtEff', 'HgtPF', 'HgtPG', 'HgtSF', 'HgtSG', 'ORC', 'ORPF', 'ORPG', 'OppARate', 'OppBlockPct', 'OppFG3Pct', 'OppFTPct', 'OppStlRate', 'PtsPG', 'PtsSF', 'StlRate', 'Tempo', 'defFTRate', 'defORPct', 'defeFGPct', 'offeFGPct']