In [106]:
# Point Python at the local Spark installation before importing pyspark
import findspark
findspark.init()
from pyspark import SparkContext
from pyspark import SparkConf

In [107]:
import pandas as pd
import numpy as np
from sklearn import grid_search, datasets
from spark_sklearn import GridSearchCV   # Spark-backed drop-in for sklearn's GridSearchCV
from sklearn import ensemble
from pyspark.sql import SparkSession
from spark_sklearn.util import createLocalSparkSession
from patsy import dmatrices
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

In [108]:
# Load the match-level dataset with the engineered features and the team1Winning target
df = pd.read_csv("../data/matcheswithfeatures.csv", index_col = 0)
df.tail()


Out[108]:
id season city date team1 team2 toss_winner toss_decision result dl_applied ... venue umpire1 umpire2 umpire3 Avg_SR_Difference Avg_WPR_Difference Total_MVP_Difference Prev_Enc_Team1_WinPerc Total_RF_Difference team1Winning
510 511 2015 Mumbai 2015-05-16 Rajasthan Royals Kolkata Knight Riders Rajasthan Royals bat normal 0 ... Brabourne Stadium RM Deshpande RK Illingworth NaN -3.303823 -0.271935 -16 50.000000 0.000000 1
513 514 2015 Mumbai 2015-05-19 Mumbai Indians Chennai Super Kings Mumbai Indians bat normal 0 ... Wankhede Stadium HDPK Dharmasena RK Illingworth NaN 6.315981 -0.617777 -24 50.000000 0.000000 1
514 515 2015 Pune 2015-05-20 Royal Challengers Bangalore Rajasthan Royals Royal Challengers Bangalore bat normal 0 ... Maharashtra Cricket Association Stadium AK Chaudhary C Shamshuddin NaN -2.200375 0.969143 5 50.000000 0.000000 1
515 516 2015 Ranchi 2015-05-22 Royal Challengers Bangalore Chennai Super Kings Chennai Super Kings field normal 0 ... JSCA International Stadium Complex AK Chaudhary CB Gaffaney NaN -0.521025 1.039181 -23 38.888889 33.333333 0
516 517 2015 Kolkata 2015-05-24 Mumbai Indians Chennai Super Kings Chennai Super Kings field normal 0 ... Eden Gardens HDPK Dharmasena RK Illingworth NaN -1.575550 -1.707931 -24 52.380952 0.000000 1

5 rows × 24 columns


In [109]:
# Start a local SparkSession; its SparkContext backs the distributed grid searches below
spark = createLocalSparkSession()

In [110]:
# patsy design matrices: '0 +' drops the intercept so X holds only the five
# engineered features, while y is the binary team1Winning outcome
y, X = dmatrices('team1Winning ~ 0 + Avg_SR_Difference + Avg_WPR_Difference + Total_MVP_Difference + Prev_Enc_Team1_WinPerc + \
                  Total_RF_Difference', df, return_type="dataframe")
y_arr = np.ravel(y)
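The patsy formula is essentially a convenient way of pulling out the five feature columns and the binary target. A minimal pandas-only sketch of the same construction (assuming the column names shown in the df.tail() output; note that patsy additionally drops rows with missing values in any formula term, which the plain column selection would not):

feature_cols = ["Avg_SR_Difference", "Avg_WPR_Difference", "Total_MVP_Difference",
                "Prev_Enc_Team1_WinPerc", "Total_RF_Difference"]
X_alt = df[feature_cols]                  # roughly the patsy design matrix X
y_alt = df["team1Winning"].values         # roughly the flattened target y_arr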

In [111]:
X.tail()


Out[111]:
Avg_SR_Difference Avg_WPR_Difference Total_MVP_Difference Prev_Enc_Team1_WinPerc Total_RF_Difference
510 -3.303823 -0.271935 -16.0 50.000000 0.000000
513 6.315981 -0.617777 -24.0 50.000000 0.000000
514 -2.200375 0.969143 5.0 50.000000 0.000000
515 -0.521025 1.039181 -23.0 38.888889 33.333333
516 -1.575550 -1.707931 -24.0 52.380952 0.000000

In [112]:
# Chronological split: matches with index below 398 (the earlier games) form the
# training set; the later matches are held out for testing
X_timetrain = X.loc[X.index < 398]
Y_timetrain = y.loc[y.index < 398]
Y_timetrain_arr = np.ravel(Y_timetrain)
X_timetest = X.loc[X.index >= 398]
Y_timetest = y.loc[y.index >= 398]
Y_timetest_arr = np.ravel(Y_timetest)
X_timetest


Out[112]:
Avg_SR_Difference Avg_WPR_Difference Total_MVP_Difference Prev_Enc_Team1_WinPerc Total_RF_Difference
398 -9.646646 0.466526 6.0 16.666667 0.000000
399 4.963605 0.097800 12.0 50.000000 0.000000
400 7.079810 0.432566 11.0 70.000000 0.000000
402 21.485599 1.176414 17.0 53.846154 -100.000000
403 -4.503334 1.663169 15.0 54.545455 100.000000
404 -7.297630 -0.332117 -1.0 72.727273 -100.000000
405 12.183316 2.316918 5.0 66.666667 -50.000000
407 -5.341707 2.620287 12.0 61.538462 50.000000
408 5.093091 0.588349 20.0 54.545455 -50.000000
410 -13.668459 -2.328697 0.0 60.000000 -66.666667
411 15.451031 0.903107 -2.0 54.545455 66.666667
412 16.852669 -1.198669 -19.0 50.000000 33.333333
413 -6.674135 2.225351 8.0 50.000000 -33.333333
415 -19.344665 -2.002792 4.0 41.666667 -66.666667
418 12.797760 0.233431 -7.0 70.000000 66.666667
419 7.351864 4.156536 -1.0 58.333333 100.000000
420 5.659837 -2.136710 4.0 50.000000 33.333333
422 7.195161 -0.935038 -8.0 45.454545 33.333333
423 -12.464962 -2.197910 -7.0 30.769231 -66.666667
424 -3.504686 0.247855 -1.0 50.000000 33.333333
425 5.508879 -1.167825 -9.0 50.000000 33.333333
426 8.303271 1.050310 -14.0 36.363636 -33.333333
428 35.077496 1.280748 -1.0 61.538462 66.666667
430 -7.544654 0.752493 -8.0 56.250000 0.000000
431 21.871322 0.886981 -11.0 54.545455 33.333333
432 0.003466 -1.131751 3.0 50.000000 -100.000000
434 11.859779 1.592142 -11.0 35.714286 33.333333
435 2.600434 1.780807 5.0 54.545455 0.000000
437 -8.157091 -0.853403 -7.0 76.923077 0.000000
438 13.456476 2.440403 -4.0 53.846154 66.666667
... ... ... ... ... ...
474 -6.548623 0.610484 -4.0 53.333333 -100.000000
475 -11.068650 -0.136905 -22.0 46.153846 -33.333333
478 7.048118 -1.246590 22.0 56.250000 -16.666667
479 -0.596885 0.020515 -11.0 50.000000 0.000000
480 17.723755 0.354974 3.0 53.846154 66.666667
482 -5.061380 -1.212153 7.0 53.846154 33.333333
483 -5.004302 0.906597 -4.0 41.666667 33.333333
485 2.012543 0.366985 21.0 57.142857 66.666667
487 2.910151 0.425559 7.0 61.538462 -33.333333
488 14.275080 0.459693 15.0 61.538462 -33.333333
489 -13.536521 0.097342 5.0 60.000000 -33.333333
490 4.510188 -0.570758 7.0 57.142857 0.000000
492 -13.615487 -0.737024 -1.0 46.666667 66.666667
493 10.775508 -0.057722 7.0 60.000000 0.000000
494 4.569589 -0.231439 25.0 58.823529 0.000000
496 -18.726971 -0.755737 -14.0 53.333333 -33.333333
497 -5.574733 0.886535 5.0 35.714286 66.666667
499 5.014283 1.972348 17.0 52.631579 0.000000
500 6.415078 0.254813 -18.0 40.000000 -33.333333
502 -1.343308 2.062729 -3.0 43.750000 -33.333333
503 -5.738591 -0.046456 16.0 62.500000 33.333333
505 6.941454 1.678318 28.0 73.333333 33.333333
506 5.622383 -1.324729 -16.0 60.000000 -66.666667
507 -0.677689 -0.313345 -11.0 66.666667 33.333333
509 -0.716536 1.824407 -33.0 42.857143 0.000000
510 -3.303823 -0.271935 -16.0 50.000000 0.000000
513 6.315981 -0.617777 -24.0 50.000000 0.000000
514 -2.200375 0.969143 5.0 50.000000 0.000000
515 -0.521025 1.039181 -23.0 38.888889 33.333333
516 -1.575550 -1.707931 -24.0 52.380952 0.000000

87 rows × 5 columns
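Before fitting anything, a quick sanity check (hypothetical, not in the original notebook) of the split sizes and of the majority-class baseline gives a floor against which the accuracies below can be judged:

print "Train/test shapes:", X_timetrain.shape, X_timetest.shape
majority = max(Y_timetest_arr.mean(), 1 - Y_timetest_arr.mean())
print "Majority-class baseline on the test matches: {:.1f}%".format(majority * 100)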


In [113]:
# Gradient boosting with a single candidate value per parameter; spark_sklearn's
# GridSearchCV distributes the candidate fits over the SparkContext
tuned_parameters = {
    "n_estimators": [100],
    "max_depth": [3],
    "learning_rate": [0.1],
}
gbc = ensemble.GradientBoostingClassifier()
clf = GridSearchCV(spark.sparkContext, gbc, tuned_parameters)
clf


Out[113]:
GridSearchCV(cv=None, error_score='raise',
       estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_split=1e-07, min_samples_leaf=1,
              min_samples_split=2, min_weight_fraction_leaf=0.0,
              n_estimators=100, presort='auto', random_state=None,
              subsample=1.0, verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_estimators': [100], 'learning_rate': [0.1], 'max_depth': [3]},
       pre_dispatch='2*n_jobs', refit=True,
       sc=<pyspark.context.SparkContext object at 0x7fc96c0d3dd0>,
       scoring=None, verbose=0)

In [114]:
# Fit on the earlier matches and score on the held-out later matches
clf.fit(X_timetrain, Y_timetrain_arr)
clftest_pred = clf.predict(X_timetest)
print "Accuracy is ", metrics.accuracy_score(Y_timetest_arr, clftest_pred) *100, "%"


Accuracy is  50.5747126437 %
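With only one value per parameter, the search above fits a single model, so there is little for Spark to parallelise. A sketch of a broader grid, where distributing the fits starts to pay off; the values are illustrative rather than tuned, and it assumes spark_sklearn's GridSearchCV exposes the usual best_params_ / best_score_ attributes after fitting, as scikit-learn's does:

wide_parameters = {
    "n_estimators": [50, 100, 200],
    "max_depth": [2, 3, 4],
    "learning_rate": [0.01, 0.1, 0.5],
}
clf_wide = GridSearchCV(spark.sparkContext, ensemble.GradientBoostingClassifier(),
                        wide_parameters)
clf_wide.fit(X_timetrain, Y_timetrain_arr)
print "Best parameters:", clf_wide.best_params_
print "Best cross-validation score: {:.3f}".format(clf_wide.best_score_)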

In [115]:
# k-nearest neighbours with a single candidate value, k = 31
knn1 = KNeighborsClassifier()
knn_params = {
    "n_neighbors": [31]
}
clf2 = GridSearchCV(spark.sparkContext, knn1, knn_params, n_jobs = 2)
clf2


Out[115]:
GridSearchCV(cv=None, error_score='raise',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params={}, iid=True, n_jobs=2, param_grid={'n_neighbors': [31]},
       pre_dispatch='2*n_jobs', refit=True,
       sc=<pyspark.context.SparkContext object at 0x7fc96c0d3dd0>,
       scoring=None, verbose=0)

In [116]:
# Same chronological evaluation for the KNN model
clf2.fit(X_timetrain, Y_timetrain_arr)
clf2test_pred = clf2.predict(X_timetest)
print "Accuracy is ", metrics.accuracy_score(Y_timetest_arr, clf2test_pred) *100, "%"


Accuracy is  64.367816092 %
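Several other classifiers are imported at the top of the notebook but never fitted in this section. A short hypothetical comparison on the same chronological split (plain fit/predict, no grid search) would look like this:

for name, model in [("Logistic Regression", LogisticRegression()),
                    ("Random Forest", RandomForestClassifier(n_estimators=100)),
                    ("Gaussian Naive Bayes", GaussianNB())]:
    model.fit(X_timetrain, Y_timetrain_arr)
    pred = model.predict(X_timetest)
    print "{}: {:.1f}%".format(name, metrics.accuracy_score(Y_timetest_arr, pred) * 100)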