In [69]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
%matplotlib inline
In [227]:
# Load the KenPom team table, keep only seasons after 2012, and discard every
# column whose name contains 'Rank'.
kenPomRaw = pd.read_csv('data/kenPomTeamData.csv')
recentSeasonMask = kenPomRaw['Season'] > 2012
nonRankCols = [name for name in kenPomRaw.columns if 'Rank' not in name]
kenPomDataWithIds = kenPomRaw.loc[recentSeasonMask, nonRankCols]
In [228]:
# Identifier / bookkeeping columns that are left out of the feature frame.
idCols = ['Season', 'Unnamed: 0', 'TeamName', 'Team_Id']
isFeatureCol = ~kenPomDataWithIds.columns.isin(idCols)
kenPomData = kenPomDataWithIds.loc[:, isFeatureCol]
In [229]:
# remove NaN: drop every column that contains at least one missing value
# TODO: not sure whether this has to happen after normalizing the hgt data column names
nanFlags = kenPomData.isnull().any()
colsWithNaN = nanFlags[nanFlags].index.tolist()
kenPomData = kenPomData.drop(colsWithNaN, axis=1)
In [231]:
# Per-column summary statistics, one row per feature, ordered from the largest
# standard deviation down (ascending sort, then reversed).
columnStats = kenPomData.describe().transpose()
columnStats.sort_values(by='std').iloc[::-1]
Out[231]:
In [233]:
# Pairwise correlation between the remaining features, drawn as a square-celled heatmap.
corrmat = kenPomData.corr()
f, ax = plt.subplots(figsize=(12, 9))
sns.heatmap(data=corrmat, square=True, ax=ax)
plt.show()
In [234]:
# Fit a PCA with default settings on the raw feature values.
# NOTE(review): the features are not standardised before fitting, so columns with
# a large variance will dominate the leading components — confirm this is intended.
featureMatrix = kenPomData.values
kpPca = PCA()
kpPca.fit(featureMatrix)
Out[234]:
In [245]:
# Cumulative share of the total variance explained as principal components are added.
# Plot through matplotlib's pyplot directly: `sns.plt` is not part of seaborn's
# public API and raises AttributeError on current seaborn releases.
plt.plot(kpPca.explained_variance_ratio_.cumsum())
plt.xlabel('number of components - 1')
plt.ylabel('cumulative explained variance ratio')
plt.show()
In [237]:
# take a look at the components:
# one bar chart per principal component showing the absolute loading of each feature.
n_components = 10
# Never request more panels than the fitted PCA actually has components.
nShown = min(n_components, len(kpPca.components_))
# squeeze=False always returns a 2-D array of axes, so a single panel works too.
f, axGrid = plt.subplots(nShown, 1, figsize=(15, 4 * nShown), squeeze=False)
allAx = axGrid.ravel()
f.subplots_adjust(top = 1.3)
for i, (panel, loadings) in enumerate(zip(allAx, kpPca.components_)):
    b = sns.barplot(x = kenPomData.columns, y = np.abs(loadings), ax = panel)
    b.set_xticklabels(kenPomData.columns, rotation = 90)
    b.set_title('component %d' % i)
    b.set_ylabel('|loading|')
plt.show()
In [244]:
# Which original variables have an absolute loading above 0.5 (squared loading
# above 0.25) on any of the leading components that together account for 95% of
# the variance?
importanceThresh = 0.5**2
# Cumulative explained-variance ratio per component. (Summing the raw loadings in
# `components_` would not measure variance at all.)
kpCumComponentVar = kpPca.explained_variance_ratio_.cumsum()
# Index of the first component at which the cumulative ratio exceeds 0.95.
# searchsorted works because the cumulative sum is non-decreasing; if the
# threshold is never exceeded it returns len(...), and the slice below then
# simply keeps every component.
idxNinetyFive = int(np.searchsorted(kpCumComponentVar, 0.95, side = 'right'))
# +1 so the component that crosses the 95% mark is itself included; otherwise a
# dominant first component would leave nothing to inspect.
leadingComponents = kpPca.components_[:idxNinetyFive + 1]
# A variable is important if it exceeds the threshold on at least one leading component.
isImportant = (leadingComponents**2 > importanceThresh).any(axis = 0)
# Listed in column order, so the output is deterministic.
importantPCAVariables = list(kenPomData.columns[isImportant])
print(int(100*len(importantPCAVariables) / len(kenPomData.columns) + 0.5), '% of features seem important', sep = '')
print('meaningful variables:\n', importantPCAVariables)