notebook.community

Edit and run



In [69]:

    
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
%matplotlib inline



In [227]:

    
kenPomDataWithIds = pd.read_csv('data/kenPomTeamData.csv')
kenPomDataWithIds = kenPomDataWithIds[kenPomDataWithIds['Season'] > 2012]
kenPomDataWithIds = kenPomDataWithIds[ [ col for col in kenPomDataWithIds if 'Rank' not in col ] ]



In [228]:

    
idCols = ['Season', 'Unnamed: 0', 'TeamName', 'Team_Id']
kenPomData = kenPomDataWithIds[ [ col for col in kenPomDataWithIds if col not in idCols ] ]



In [229]:

    
# remove NaN
# not sure if I have to do this after normalizing hgt data column names

colsWithNaN = [ col for col in kenPomData if kenPomData[col].isnull().any() ]
kenPomData = kenPomData[ [ col for col in kenPomData if col not in colsWithNaN ] ]



In [231]:

    
# take a look at column stats

kenPomData.describe().T.sort_values(by = 'std')[::-1]









    Out[231]:






  
    
      
      count
      mean
      std
      min
      25%
      50%
      75%
      max
    
  
  
    
      AdjEM
      1751.0
      0.000126
      11.928057
      -46.110000
      -8.390000
      -0.830000
      7.934670
      36.900000
    
    
      AdjOE
      1751.0
      102.885645
      7.360252
      74.270000
      97.940650
      102.710000
      107.707000
      126.950000
    
    
      ORC
      1751.0
      34.933432
      6.774318
      0.010000
      30.510000
      34.510000
      39.225000
      60.310000
    
    
      OE
      1751.0
      102.501754
      6.682151
      75.880000
      98.050000
      102.680000
      106.820000
      122.857000
    
    
      AdjDE
      1751.0
      102.885517
      6.653718
      82.460000
      98.415000
      103.170000
      107.529000
      121.964000
    
    
      ORPF
      1751.0
      28.078835
      6.544270
      0.000000
      23.890000
      27.940000
      32.160000
      59.120000
    
    
      defFTRate
      1751.0
      37.389800
      6.466229
      19.963201
      32.851592
      37.000700
      41.485660
      60.742300
    
    
      ORSF
      1751.0
      17.619783
      6.016728
      0.000000
      13.330000
      17.100000
      21.380000
      99.980000
    
    
      DE
      1751.0
      102.975450
      5.854790
      84.310000
      98.975000
      103.250000
      106.965000
      122.345000
    
    
      ARate
      1751.0
      52.583720
      5.517699
      35.182500
      48.752339
      52.327200
      56.262250
      71.258700
    
    
      Bench
      1751.0
      31.595694
      5.480651
      13.670000
      27.885000
      31.680000
      35.360000
      49.540000
    
    
      offFTRate
      1751.0
      37.099190
      5.244431
      21.414538
      33.462413
      36.941410
      40.567030
      58.585900
    
    
      OppARate
      1751.0
      52.786651
      5.043708
      39.277108
      49.326650
      52.600300
      56.028250
      72.266200
    
    
      DRC
      1751.0
      26.364289
      4.968006
      0.010000
      23.420000
      26.150000
      29.080000
      99.960000
    
    
      DRPG
      1751.0
      14.057801
      4.490671
      0.010000
      12.085000
      13.750000
      15.625000
      99.970000
    
    
      ORSG
      1751.0
      11.170023
      4.475188
      0.000000
      8.065000
      10.610000
      13.580000
      43.230000
    
    
      DRPF
      1751.0
      24.246933
      4.412403
      0.010000
      21.740000
      24.090000
      26.580000
      99.960000
    
    
      PtsPG
      1751.0
      19.356208
      4.319474
      8.620000
      16.295000
      18.980000
      22.095000
      33.640000
    
    
      offORPct
      1751.0
      30.582126
      4.128940
      13.426854
      27.908040
      30.604650
      33.407500
      43.826100
    
    
      PtsC
      1751.0
      19.262701
      4.031621
      8.460000
      16.585000
      19.050000
      21.855000
      34.000000
    
    
      ORPG
      1751.0
      8.180240
      3.953338
      0.000000
      5.850000
      7.680000
      9.960000
      99.980000
    
    
      FTPct
      1751.0
      69.600811
      3.769245
      56.504900
      66.981850
      69.577500
      72.289697
      82.828283
    
    
      DRSG
      1751.0
      16.081462
      3.662340
      0.010000
      13.905000
      15.890000
      17.930000
      80.800000
    
    
      PtsSG
      1751.0
      20.621953
      3.661595
      9.010000
      18.095000
      20.590000
      22.985000
      34.570000
    
    
      DRSF
      1751.0
      19.234752
      3.619687
      0.010000
      16.980000
      19.080000
      21.400000
      52.900000
    
    
      PtsSF
      1751.0
      20.424472
      3.583453
      7.790000
      18.055000
      20.290000
      22.730000
      33.410000
    
    
      Tempo
      1751.0
      67.346562
      3.448001
      57.060000
      64.930000
      67.280000
      69.624850
      83.759900
    
    
      OppFG2Pct
      1751.0
      48.441529
      3.446107
      37.641357
      46.012219
      48.422800
      50.884100
      59.974425
    
    
      PtsPF
      1751.0
      20.322953
      3.424899
      9.640000
      17.995000
      20.190000
      22.545000
      33.200000
    
    
      FG2Pct
      1751.0
      48.268598
      3.376487
      37.606838
      45.969550
      48.273200
      50.428636
      62.556100
    
    
      AdjTempo
      1751.0
      67.175375
      3.241102
      57.780000
      64.910000
      67.095700
      69.211450
      83.446800
    
    
      offeFGPct
      1751.0
      49.384335
      3.218561
      39.227600
      47.252143
      49.331400
      51.551950
      61.689910
    
    
      defORPct
      1751.0
      30.780149
      3.077144
      20.871143
      28.694738
      30.833333
      32.787500
      41.048825
    
    
      defeFGPct
      1751.0
      49.571613
      3.056477
      39.557855
      47.394897
      49.607700
      51.608600
      59.360731
    
    
      FG3Pct
      1751.0
      34.291679
      2.964160
      24.870500
      32.251700
      34.242400
      36.318150
      44.117600
    
    
      BlockPct
      1751.0
      9.342820
      2.786187
      2.927200
      7.234900
      9.074410
      11.166150
      20.190779
    
    
      OppFG3Pct
      1751.0
      34.493254
      2.596979
      27.089800
      32.699250
      34.428600
      36.223205
      43.795620
    
    
      OppFTPct
      1751.0
      69.672968
      2.498298
      59.176030
      68.018700
      69.776600
      71.340800
      77.611940
    
    
      defTOPct
      1751.0
      18.870992
      2.292262
      10.214000
      17.291300
      18.786500
      20.276750
      32.708337
    
    
      offTOPct
      1751.0
      18.939534
      2.157613
      11.900000
      17.475950
      18.802200
      20.277435
      27.183900
    
    
      HgtEff
      1751.0
      -0.000126
      1.920951
      -7.310000
      -1.220000
      -0.080000
      1.195000
      9.330000
    
    
      OppBlockPct
      1751.0
      9.500723
      1.723363
      4.037685
      8.291300
      9.396914
      10.576144
      18.478261
    
    
      HgtPG
      1751.0
      -0.000074
      1.616856
      -6.290000
      -1.060000
      0.080000
      1.160000
      4.760000
    
    
      HgtC
      1751.0
      0.000103
      1.252458
      -3.560000
      -0.910000
      -0.060000
      0.730000
      6.960000
    
    
      HgtSG
      1751.0
      -0.000029
      1.244951
      -5.560000
      -0.815000
      0.010000
      0.865000
      4.140000
    
    
      HgtSF
      1751.0
      0.000194
      1.135412
      -4.370000
      -0.770000
      -0.050000
      0.850000
      4.110000
    
    
      HgtPF
      1751.0
      0.000051
      1.085649
      -3.780000
      -0.730000
      -0.030000
      0.790000
      4.340000
    
    
      Height
      1751.0
      76.704980
      0.912441
      73.430000
      76.080000
      76.690000
      77.350000
      79.400000
    
    
      Exp
      1751.0
      1.698744
      0.398968
      0.280000
      1.430000
      1.710000
      1.990000
      2.850000
    
    
      StlRate
      1751.0
      0.091188
      0.016980
      0.042100
      0.079684
      0.090000
      0.101700
      0.170273
    
    
      OppStlRate
      1751.0
      0.091697
      0.013406
      0.048849
      0.082607
      0.090502
      0.099998
      0.155200



In [233]:

    
# take a look at correlations

corrmat = kenPomData.corr()
f, ax = plt.subplots(figsize = (12, 9))
sns.heatmap(corrmat, square=True)
plt.show()



In [234]:

    
kpPca =  PCA()
kpPca.fit(kenPomData.values)









    Out[234]:





PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)



In [245]:

    
sns.plt.plot(kpPca.explained_variance_ratio_.cumsum())
sns.plt.show()



In [237]:

    
# take a look at the components

n_components = 10
f, allAx = plt.subplots(n_components, 1, figsize=(15, 4 * n_components))
f.subplots_adjust(top = 1.3)
for i in range(len(allAx)):
    b = sns.barplot(x = kenPomData.columns, y = np.abs(kpPca.components_[i]), ax = allAx[i])
    b.set_xticklabels(kenPomData.columns, rotation = 90)
plt.show()



In [244]:

    
# which components contribute more than 0.3 to a component in the first 95% of the variance?

importanceThresh = 0.5**2
kpCumComponentVar = kpPca.components_.cumsum()
idxNinetyFive = next(i for i in range(len(kpCumComponentVar)) if kpCumComponentVar[i] > 0.95)
importantPCAVariables = set()
for component in kpPca.components_[:idxNinetyFive]:
    componentImportantVariables = set([ i for i in range(len(component)) if component[i]**2 > importanceThresh ])
    importantPCAVariables = importantPCAVariables.union(componentImportantVariables)
importantPCAVariables = list(map(lambda idx: kenPomData.columns[idx], importantPCAVariables))
print(int(100*len(importantPCAVariables) / len(kenPomData.columns) + 0.5), '% of features seem important', sep = '')
print('meaningful variables:\n', importantPCAVariables)









    



65% of features seem important
meaningful variables:
 ['ARate', 'AdjDE', 'AdjEM', 'AdjOE', 'AdjTempo', 'BlockPct', 'DRPG', 'Exp', 'FG3Pct', 'FTPct', 'Height', 'HgtC', 'HgtEff', 'HgtPF', 'HgtPG', 'HgtSF', 'HgtSG', 'ORC', 'ORPF', 'ORPG', 'OppARate', 'OppBlockPct', 'OppFG3Pct', 'OppFTPct', 'OppStlRate', 'PtsPG', 'PtsSF', 'StlRate', 'Tempo', 'defFTRate', 'defORPct', 'defeFGPct', 'offeFGPct']

	count	mean	std	min	25%	50%	75%	max
AdjEM	1751.0	0.000126	11.928057	-46.110000	-8.390000	-0.830000	7.934670	36.900000
AdjOE	1751.0	102.885645	7.360252	74.270000	97.940650	102.710000	107.707000	126.950000
ORC	1751.0	34.933432	6.774318	0.010000	30.510000	34.510000	39.225000	60.310000
OE	1751.0	102.501754	6.682151	75.880000	98.050000	102.680000	106.820000	122.857000
AdjDE	1751.0	102.885517	6.653718	82.460000	98.415000	103.170000	107.529000	121.964000
ORPF	1751.0	28.078835	6.544270	0.000000	23.890000	27.940000	32.160000	59.120000
defFTRate	1751.0	37.389800	6.466229	19.963201	32.851592	37.000700	41.485660	60.742300
ORSF	1751.0	17.619783	6.016728	0.000000	13.330000	17.100000	21.380000	99.980000
DE	1751.0	102.975450	5.854790	84.310000	98.975000	103.250000	106.965000	122.345000
ARate	1751.0	52.583720	5.517699	35.182500	48.752339	52.327200	56.262250	71.258700
Bench	1751.0	31.595694	5.480651	13.670000	27.885000	31.680000	35.360000	49.540000
offFTRate	1751.0	37.099190	5.244431	21.414538	33.462413	36.941410	40.567030	58.585900
OppARate	1751.0	52.786651	5.043708	39.277108	49.326650	52.600300	56.028250	72.266200
DRC	1751.0	26.364289	4.968006	0.010000	23.420000	26.150000	29.080000	99.960000
DRPG	1751.0	14.057801	4.490671	0.010000	12.085000	13.750000	15.625000	99.970000
ORSG	1751.0	11.170023	4.475188	0.000000	8.065000	10.610000	13.580000	43.230000
DRPF	1751.0	24.246933	4.412403	0.010000	21.740000	24.090000	26.580000	99.960000
PtsPG	1751.0	19.356208	4.319474	8.620000	16.295000	18.980000	22.095000	33.640000
offORPct	1751.0	30.582126	4.128940	13.426854	27.908040	30.604650	33.407500	43.826100
PtsC	1751.0	19.262701	4.031621	8.460000	16.585000	19.050000	21.855000	34.000000
ORPG	1751.0	8.180240	3.953338	0.000000	5.850000	7.680000	9.960000	99.980000
FTPct	1751.0	69.600811	3.769245	56.504900	66.981850	69.577500	72.289697	82.828283
DRSG	1751.0	16.081462	3.662340	0.010000	13.905000	15.890000	17.930000	80.800000
PtsSG	1751.0	20.621953	3.661595	9.010000	18.095000	20.590000	22.985000	34.570000
DRSF	1751.0	19.234752	3.619687	0.010000	16.980000	19.080000	21.400000	52.900000
PtsSF	1751.0	20.424472	3.583453	7.790000	18.055000	20.290000	22.730000	33.410000
Tempo	1751.0	67.346562	3.448001	57.060000	64.930000	67.280000	69.624850	83.759900
OppFG2Pct	1751.0	48.441529	3.446107	37.641357	46.012219	48.422800	50.884100	59.974425
PtsPF	1751.0	20.322953	3.424899	9.640000	17.995000	20.190000	22.545000	33.200000
FG2Pct	1751.0	48.268598	3.376487	37.606838	45.969550	48.273200	50.428636	62.556100
AdjTempo	1751.0	67.175375	3.241102	57.780000	64.910000	67.095700	69.211450	83.446800
offeFGPct	1751.0	49.384335	3.218561	39.227600	47.252143	49.331400	51.551950	61.689910
defORPct	1751.0	30.780149	3.077144	20.871143	28.694738	30.833333	32.787500	41.048825
defeFGPct	1751.0	49.571613	3.056477	39.557855	47.394897	49.607700	51.608600	59.360731
FG3Pct	1751.0	34.291679	2.964160	24.870500	32.251700	34.242400	36.318150	44.117600
BlockPct	1751.0	9.342820	2.786187	2.927200	7.234900	9.074410	11.166150	20.190779
OppFG3Pct	1751.0	34.493254	2.596979	27.089800	32.699250	34.428600	36.223205	43.795620
OppFTPct	1751.0	69.672968	2.498298	59.176030	68.018700	69.776600	71.340800	77.611940
defTOPct	1751.0	18.870992	2.292262	10.214000	17.291300	18.786500	20.276750	32.708337
offTOPct	1751.0	18.939534	2.157613	11.900000	17.475950	18.802200	20.277435	27.183900
HgtEff	1751.0	-0.000126	1.920951	-7.310000	-1.220000	-0.080000	1.195000	9.330000
OppBlockPct	1751.0	9.500723	1.723363	4.037685	8.291300	9.396914	10.576144	18.478261
HgtPG	1751.0	-0.000074	1.616856	-6.290000	-1.060000	0.080000	1.160000	4.760000
HgtC	1751.0	0.000103	1.252458	-3.560000	-0.910000	-0.060000	0.730000	6.960000
HgtSG	1751.0	-0.000029	1.244951	-5.560000	-0.815000	0.010000	0.865000	4.140000
HgtSF	1751.0	0.000194	1.135412	-4.370000	-0.770000	-0.050000	0.850000	4.110000
HgtPF	1751.0	0.000051	1.085649	-3.780000	-0.730000	-0.030000	0.790000	4.340000
Height	1751.0	76.704980	0.912441	73.430000	76.080000	76.690000	77.350000	79.400000
Exp	1751.0	1.698744	0.398968	0.280000	1.430000	1.710000	1.990000	2.850000
StlRate	1751.0	0.091188	0.016980	0.042100	0.079684	0.090000	0.101700	0.170273
OppStlRate	1751.0	0.091697	0.013406	0.048849	0.082607	0.090502	0.099998	0.155200