In [106]:
# Point Python at the local Spark installation before importing pyspark
import findspark
findspark.init()
from pyspark import SparkContext
from pyspark import SparkConf

In [107]:
import pandas as pd
import numpy as np
from sklearn import grid_search, datasets
from spark_sklearn import GridSearchCV   # Spark-backed drop-in for sklearn's GridSearchCV
from sklearn import ensemble
from pyspark.sql import SparkSession
from spark_sklearn.util import createLocalSparkSession
from patsy import dmatrices
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

In [108]:
# Load the match-level dataset with the engineered features and the team1Winning target
df = pd.read_csv("../data/matcheswithfeatures.csv", index_col = 0)
df.tail()


Out[108]:
id season city date team1 team2 toss_winner toss_decision result dl_applied ... venue umpire1 umpire2 umpire3 Avg_SR_Difference Avg_WPR_Difference Total_MVP_Difference Prev_Enc_Team1_WinPerc Total_RF_Difference team1Winning
510 511 2015 Mumbai 2015-05-16 Rajasthan Royals Kolkata Knight Riders Rajasthan Royals bat normal 0 ... Brabourne Stadium RM Deshpande RK Illingworth NaN -3.303823 -0.271935 -16 50.000000 0.000000 1
513 514 2015 Mumbai 2015-05-19 Mumbai Indians Chennai Super Kings Mumbai Indians bat normal 0 ... Wankhede Stadium HDPK Dharmasena RK Illingworth NaN 6.315981 -0.617777 -24 50.000000 0.000000 1
514 515 2015 Pune 2015-05-20 Royal Challengers Bangalore Rajasthan Royals Royal Challengers Bangalore bat normal 0 ... Maharashtra Cricket Association Stadium AK Chaudhary C Shamshuddin NaN -2.200375 0.969143 5 50.000000 0.000000 1
515 516 2015 Ranchi 2015-05-22 Royal Challengers Bangalore Chennai Super Kings Chennai Super Kings field normal 0 ... JSCA International Stadium Complex AK Chaudhary CB Gaffaney NaN -0.521025 1.039181 -23 38.888889 33.333333 0
516 517 2015 Kolkata 2015-05-24 Mumbai Indians Chennai Super Kings Chennai Super Kings field normal 0 ... Eden Gardens HDPK Dharmasena RK Illingworth NaN -1.575550 -1.707931 -24 52.380952 0.000000 1

5 rows × 24 columns


In [109]:
# Start a local SparkSession; its SparkContext backs the distributed grid searches below
spark = createLocalSparkSession()

In [110]:
# patsy design matrices: '0 +' drops the intercept so X holds only the five
# engineered features, while y is the binary team1Winning outcome
y, X = dmatrices('team1Winning ~ 0 + Avg_SR_Difference + Avg_WPR_Difference + Total_MVP_Difference + Prev_Enc_Team1_WinPerc + \
                  Total_RF_Difference', df, return_type="dataframe")
y_arr = np.ravel(y)
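The patsy formula is essentially a convenient way of pulling out the five feature columns and the binary target. A minimal pandas-only sketch of the same construction (assuming the column names shown in the df.tail() output; note that patsy additionally drops rows with missing values in any formula term, which the plain column selection would not):

feature_cols = ["Avg_SR_Difference", "Avg_WPR_Difference", "Total_MVP_Difference",
                "Prev_Enc_Team1_WinPerc", "Total_RF_Difference"]
X_alt = df[feature_cols]                  # roughly the patsy design matrix X
y_alt = df["team1Winning"].values         # roughly the flattened target y_arr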

In [111]:
X.tail()


Out[111]:
Avg_SR_Difference Avg_WPR_Difference Total_MVP_Difference Prev_Enc_Team1_WinPerc Total_RF_Difference
510 -3.303823 -0.271935 -16.0 50.000000 0.000000
513 6.315981 -0.617777 -24.0 50.000000 0.000000
514 -2.200375 0.969143 5.0 50.000000 0.000000
515 -0.521025 1.039181 -23.0 38.888889 33.333333
516 -1.575550 -1.707931 -24.0 52.380952 0.000000

In [112]:
# Chronological split: matches with index below 398 (the earlier games) form the
# training set; the later matches are held out for testing
X_timetrain = X.loc[X.index < 398]
Y_timetrain = y.loc[y.index < 398]
Y_timetrain_arr = np.ravel(Y_timetrain)
X_timetest = X.loc[X.index >= 398]
Y_timetest = y.loc[y.index >= 398]
Y_timetest_arr = np.ravel(Y_timetest)
X_timetest


Out[112]:
Avg_SR_Difference Avg_WPR_Difference Total_MVP_Difference Prev_Enc_Team1_WinPerc Total_RF_Difference
398 -9.646646 0.466526 6.0 16.666667 0.000000
399 4.963605 0.097800 12.0 50.000000 0.000000
400 7.079810 0.432566 11.0 70.000000 0.000000
402 21.485599 1.176414 17.0 53.846154 -100.000000
403 -4.503334 1.663169 15.0 54.545455 100.000000
404 -7.297630 -0.332117 -1.0 72.727273 -100.000000
405 12.183316 2.316918 5.0 66.666667 -50.000000
407 -5.341707 2.620287 12.0 61.538462 50.000000
408 5.093091 0.588349 20.0 54.545455 -50.000000
410 -13.668459 -2.328697 0.0 60.000000 -66.666667
411 15.451031 0.903107 -2.0 54.545455 66.666667
412 16.852669 -1.198669 -19.0 50.000000 33.333333
413 -6.674135 2.225351 8.0 50.000000 -33.333333
415 -19.344665 -2.002792 4.0 41.666667 -66.666667
418 12.797760 0.233431 -7.0 70.000000 66.666667
419 7.351864 4.156536 -1.0 58.333333 100.000000
420 5.659837 -2.136710 4.0 50.000000 33.333333
422 7.195161 -0.935038 -8.0 45.454545 33.333333
423 -12.464962 -2.197910 -7.0 30.769231 -66.666667
424 -3.504686 0.247855 -1.0 50.000000 33.333333
425 5.508879 -1.167825 -9.0 50.000000 33.333333
426 8.303271 1.050310 -14.0 36.363636 -33.333333
428 35.077496 1.280748 -1.0 61.538462 66.666667
430 -7.544654 0.752493 -8.0 56.250000 0.000000
431 21.871322 0.886981 -11.0 54.545455 33.333333
432 0.003466 -1.131751 3.0 50.000000 -100.000000
434 11.859779 1.592142 -11.0 35.714286 33.333333
435 2.600434 1.780807 5.0 54.545455 0.000000
437 -8.157091 -0.853403 -7.0 76.923077 0.000000
438 13.456476 2.440403 -4.0 53.846154 66.666667
... ... ... ... ... ...
474 -6.548623 0.610484 -4.0 53.333333 -100.000000
475 -11.068650 -0.136905 -22.0 46.153846 -33.333333
478 7.048118 -1.246590 22.0 56.250000 -16.666667
479 -0.596885 0.020515 -11.0 50.000000 0.000000
480 17.723755 0.354974 3.0 53.846154 66.666667
482 -5.061380 -1.212153 7.0 53.846154 33.333333
483 -5.004302 0.906597 -4.0 41.666667 33.333333
485 2.012543 0.366985 21.0 57.142857 66.666667
487 2.910151 0.425559 7.0 61.538462 -33.333333
488 14.275080 0.459693 15.0 61.538462 -33.333333
489 -13.536521 0.097342 5.0 60.000000 -33.333333
490 4.510188 -0.570758 7.0 57.142857 0.000000
492 -13.615487 -0.737024 -1.0 46.666667 66.666667
493 10.775508 -0.057722 7.0 60.000000 0.000000
494 4.569589 -0.231439 25.0 58.823529 0.000000
496 -18.726971 -0.755737 -14.0 53.333333 -33.333333
497 -5.574733 0.886535 5.0 35.714286 66.666667
499 5.014283 1.972348 17.0 52.631579 0.000000
500 6.415078 0.254813 -18.0 40.000000 -33.333333
502 -1.343308 2.062729 -3.0 43.750000 -33.333333
503 -5.738591 -0.046456 16.0 62.500000 33.333333
505 6.941454 1.678318 28.0 73.333333 33.333333
506 5.622383 -1.324729 -16.0 60.000000 -66.666667
507 -0.677689 -0.313345 -11.0 66.666667 33.333333
509 -0.716536 1.824407 -33.0 42.857143 0.000000
510 -3.303823 -0.271935 -16.0 50.000000 0.000000
513 6.315981 -0.617777 -24.0 50.000000 0.000000
514 -2.200375 0.969143 5.0 50.000000 0.000000
515 -0.521025 1.039181 -23.0 38.888889 33.333333
516 -1.575550 -1.707931 -24.0 52.380952 0.000000

87 rows × 5 columns
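Before fitting anything, a quick sanity check (hypothetical, not in the original notebook) of the split sizes and of the majority-class baseline gives a floor against which the accuracies below can be judged:

print "Train/test shapes:", X_timetrain.shape, X_timetest.shape
majority = max(Y_timetest_arr.mean(), 1 - Y_timetest_arr.mean())
print "Majority-class baseline on the test matches: {:.1f}%".format(majority * 100)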


In [113]:
# Gradient boosting with a single candidate value per parameter; spark_sklearn's
# GridSearchCV distributes the candidate fits over the SparkContext
tuned_parameters = {
    "n_estimators": [100],
    "max_depth": [3],
    "learning_rate": [0.1],
}
gbc = ensemble.GradientBoostingClassifier()
clf = GridSearchCV(spark.sparkContext, gbc, tuned_parameters)
clf


Out[113]:
GridSearchCV(cv=None, error_score='raise',
       estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_split=1e-07, min_samples_leaf=1,
              min_samples_split=2, min_weight_fraction_leaf=0.0,
              n_estimators=100, presort='auto', random_state=None,
              subsample=1.0, verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_estimators': [100], 'learning_rate': [0.1], 'max_depth': [3]},
       pre_dispatch='2*n_jobs', refit=True,
       sc=<pyspark.context.SparkContext object at 0x7fc96c0d3dd0>,
       scoring=None, verbose=0)

In [114]:
# Fit on the earlier matches and score on the held-out later matches
clf.fit(X_timetrain, Y_timetrain_arr)
clftest_pred = clf.predict(X_timetest)
print "Accuracy is ", metrics.accuracy_score(Y_timetest_arr, clftest_pred) *100, "%"


Accuracy is  50.5747126437 %
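With only one value per parameter, the search above fits a single model, so there is little for Spark to parallelise. A sketch of a broader grid, where distributing the fits starts to pay off; the values are illustrative rather than tuned, and it assumes spark_sklearn's GridSearchCV exposes the usual best_params_ / best_score_ attributes after fitting, as scikit-learn's does:

wide_parameters = {
    "n_estimators": [50, 100, 200],
    "max_depth": [2, 3, 4],
    "learning_rate": [0.01, 0.1, 0.5],
}
clf_wide = GridSearchCV(spark.sparkContext, ensemble.GradientBoostingClassifier(),
                        wide_parameters)
clf_wide.fit(X_timetrain, Y_timetrain_arr)
print "Best parameters:", clf_wide.best_params_
print "Best cross-validation score: {:.3f}".format(clf_wide.best_score_)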

In [115]:
# k-nearest neighbours with a single candidate value, k = 31
knn1 = KNeighborsClassifier()
knn_params = {
    "n_neighbors": [31]
}
clf2 = GridSearchCV(spark.sparkContext, knn1, knn_params, n_jobs = 2)
clf2


Out[115]:
GridSearchCV(cv=None, error_score='raise',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params={}, iid=True, n_jobs=2, param_grid={'n_neighbors': [31]},
       pre_dispatch='2*n_jobs', refit=True,
       sc=<pyspark.context.SparkContext object at 0x7fc96c0d3dd0>,
       scoring=None, verbose=0)

In [116]:
# Same chronological evaluation for the KNN model
clf2.fit(X_timetrain, Y_timetrain_arr)
clf2test_pred = clf2.predict(X_timetest)
print "Accuracy is ", metrics.accuracy_score(Y_timetest_arr, clf2test_pred) *100, "%"


Accuracy is  64.367816092 %
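Several other classifiers are imported at the top of the notebook but never fitted in this section. A short hypothetical comparison on the same chronological split (plain fit/predict, no grid search) would look like this:

for name, model in [("Logistic Regression", LogisticRegression()),
                    ("Random Forest", RandomForestClassifier(n_estimators=100)),
                    ("Gaussian Naive Bayes", GaussianNB())]:
    model.fit(X_timetrain, Y_timetrain_arr)
    pred = model.predict(X_timetest)
    print "{}: {:.1f}%".format(name, metrics.accuracy_score(Y_timetest_arr, pred) * 100)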