Predicting the Outcome of Cricket Matches


In [2]:
%matplotlib inline 
import numpy as np # imports a fast numerical programming library
import pandas as pd #lets us handle data as dataframes
# Configure how pandas renders DataFrames in this notebook:
# a wide text display, up to 100 visible columns, and HTML table output.
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from patsy import dmatrices

In [3]:
# Load the per-match feature table; the first CSV column is used as the row index.
matches_csv_path = "../data/matcheswithfeatures.csv"
matches = pd.read_csv(matches_csv_path, index_col=0)

In [4]:
# Patsy formula: `team1Winning` is the target, the five difference/percentage
# columns are the predictors, and the leading `0 +` suppresses the intercept column.
model_formula = 'team1Winning ~ 0 + Avg_SR_Difference + Avg_WPR_Difference + Total_MVP_Difference + Prev_Enc_Team1_WinPerc + \
                  Total_RF_Difference'

# Build the target (y) and feature (X) frames from the matches table.
y, X = dmatrices(model_formula, data=matches, return_type="dataframe")

# Flattened 1-D copy of the target, the shape scikit-learn estimators expect.
y_arr = np.ravel(y)

In [5]:
# Show the feature matrix produced by dmatrices (bare last expression -> rich table display).
X


Out[5]:
Avg_SR_Difference Avg_WPR_Difference Total_MVP_Difference Prev_Enc_Team1_WinPerc Total_RF_Difference
5 55.665975 1.414786 0.0 0.000000 0.000000
7 6.135734 -1.591368 1.0 0.000000 100.000000
9 4.666844 0.111379 0.0 0.000000 0.000000
10 25.388743 -0.021123 0.0 0.000000 0.000000
11 -28.438618 11.723738 0.0 0.000000 0.000000
13 41.221731 6.066625 0.0 0.000000 66.666667
14 37.233069 0.581470 2.0 0.000000 66.666667
15 -13.582248 1.010938 1.0 0.000000 50.000000
16 15.293648 2.058102 -1.0 0.000000 16.666667
18 40.069300 -2.720529 2.0 0.000000 33.333333
19 -6.529304 0.155223 0.0 0.000000 33.333333
21 60.276090 1.503388 1.0 0.000000 66.666667
22 -36.759577 -0.140660 -1.0 0.000000 -33.333333
23 -3.740886 -1.545548 -2.0 0.000000 -33.333333
24 -29.919482 -1.732469 -1.0 0.000000 -100.000000
26 -1.150869 1.681456 2.0 0.000000 33.333333
27 4.157345 0.525677 1.0 100.000000 33.333333
28 -11.720957 2.154708 -1.0 100.000000 0.000000
30 -12.795080 -3.123743 -2.0 100.000000 -100.000000
31 15.090854 -2.420466 0.0 100.000000 -66.666667
33 16.965741 0.337565 1.0 100.000000 -33.333333
34 -39.254979 -1.533140 -1.0 0.000000 -66.666667
35 -30.450448 -0.062459 -2.0 0.000000 33.333333
36 9.397768 1.087380 0.0 100.000000 -33.333333
37 5.965139 0.737928 0.0 0.000000 0.000000
39 -20.297425 10.328566 -2.0 0.000000 -33.333333
41 -1.229378 2.296430 5.0 100.000000 33.333333
44 -19.002114 -1.367688 -3.0 0.000000 0.000000
45 -9.501326 -1.699759 -5.0 0.000000 0.000000
46 17.779892 1.135235 -2.0 100.000000 -33.333333
... ... ... ... ... ...
474 -6.548623 0.610484 -4.0 53.333333 -100.000000
475 -11.068650 -0.136905 -22.0 46.153846 -33.333333
478 7.048118 -1.246590 22.0 56.250000 -16.666667
479 -0.596885 0.020515 -11.0 50.000000 0.000000
480 17.723755 0.354974 3.0 53.846154 66.666667
482 -5.061380 -1.212153 7.0 53.846154 33.333333
483 -5.004302 0.906597 -4.0 41.666667 33.333333
485 2.012543 0.366985 21.0 57.142857 66.666667
487 2.910151 0.425559 7.0 61.538462 -33.333333
488 14.275080 0.459693 15.0 61.538462 -33.333333
489 -13.536521 0.097342 5.0 60.000000 -33.333333
490 4.510188 -0.570758 7.0 57.142857 0.000000
492 -13.615487 -0.737024 -1.0 46.666667 66.666667
493 10.775508 -0.057722 7.0 60.000000 0.000000
494 4.569589 -0.231439 25.0 58.823529 0.000000
496 -18.726971 -0.755737 -14.0 53.333333 -33.333333
497 -5.574733 0.886535 5.0 35.714286 66.666667
499 5.014283 1.972348 17.0 52.631579 0.000000
500 6.415078 0.254813 -18.0 40.000000 -33.333333
502 -1.343308 2.062729 -3.0 43.750000 -33.333333
503 -5.738591 -0.046456 16.0 62.500000 33.333333
505 6.941454 1.678318 28.0 73.333333 33.333333
506 5.622383 -1.324729 -16.0 60.000000 -66.666667
507 -0.677689 -0.313345 -11.0 66.666667 33.333333
509 -0.716536 1.824407 -33.0 42.857143 0.000000
510 -3.303823 -0.271935 -16.0 50.000000 0.000000
513 6.315981 -0.617777 -24.0 50.000000 0.000000
514 -2.200375 0.969143 5.0 50.000000 0.000000
515 -0.521025 1.039181 -23.0 38.888889 33.333333
516 -1.575550 -1.707931 -24.0 52.380952 0.000000

327 rows × 5 columns

Splitting Training Set (2008-2013) and Test Set (2013-2015) based on Seasons


In [6]:
# Season-based hold-out: rows labelled below 398 are used for training,
# rows labelled 398 and above are held out for testing.
first_test_label = 398

train_mask_X = X.index < first_test_label
test_mask_X = X.index >= first_test_label
train_mask_y = y.index < first_test_label
test_mask_y = y.index >= first_test_label

X_timetrain = X.loc[train_mask_X]
X_timetest = X.loc[test_mask_X]

Y_timetrain = y.loc[train_mask_y]
Y_timetest = y.loc[test_mask_y]

# 1-D views of the targets for fitting and scoring.
Y_timetrain_arr = np.ravel(Y_timetrain)
Y_timetest_arr = np.ravel(Y_timetest)

In [7]:
# Best value of k found for the time-based split: 31 neighbours.
knn1 = KNeighborsClassifier(n_neighbors = 31)
knn1.fit(X_timetrain, Y_timetrain_arr)

# Score the classifier on the held-out (later) matches.
y_pred = knn1.predict(X_timetest)
accuracy_pct = metrics.accuracy_score(Y_timetest_arr, y_pred)*100

# Single-argument print(...) produces the same text as the old
# `print a, b, c` statement on Python 2 and is also valid on Python 3.
print("Accuracy is  " + str(accuracy_pct) + " %")


Accuracy is  64.367816092 %

In [8]:
# Show the held-out feature rows (index >= 398) that knn1 was scored on.
X_timetest


Out[8]:
Avg_SR_Difference Avg_WPR_Difference Total_MVP_Difference Prev_Enc_Team1_WinPerc Total_RF_Difference
398 -9.646646 0.466526 6.0 16.666667 0.000000
399 4.963605 0.097800 12.0 50.000000 0.000000
400 7.079810 0.432566 11.0 70.000000 0.000000
402 21.485599 1.176414 17.0 53.846154 -100.000000
403 -4.503334 1.663169 15.0 54.545455 100.000000
404 -7.297630 -0.332117 -1.0 72.727273 -100.000000
405 12.183316 2.316918 5.0 66.666667 -50.000000
407 -5.341707 2.620287 12.0 61.538462 50.000000
408 5.093091 0.588349 20.0 54.545455 -50.000000
410 -13.668459 -2.328697 0.0 60.000000 -66.666667
411 15.451031 0.903107 -2.0 54.545455 66.666667
412 16.852669 -1.198669 -19.0 50.000000 33.333333
413 -6.674135 2.225351 8.0 50.000000 -33.333333
415 -19.344665 -2.002792 4.0 41.666667 -66.666667
418 12.797760 0.233431 -7.0 70.000000 66.666667
419 7.351864 4.156536 -1.0 58.333333 100.000000
420 5.659837 -2.136710 4.0 50.000000 33.333333
422 7.195161 -0.935038 -8.0 45.454545 33.333333
423 -12.464962 -2.197910 -7.0 30.769231 -66.666667
424 -3.504686 0.247855 -1.0 50.000000 33.333333
425 5.508879 -1.167825 -9.0 50.000000 33.333333
426 8.303271 1.050310 -14.0 36.363636 -33.333333
428 35.077496 1.280748 -1.0 61.538462 66.666667
430 -7.544654 0.752493 -8.0 56.250000 0.000000
431 21.871322 0.886981 -11.0 54.545455 33.333333
432 0.003466 -1.131751 3.0 50.000000 -100.000000
434 11.859779 1.592142 -11.0 35.714286 33.333333
435 2.600434 1.780807 5.0 54.545455 0.000000
437 -8.157091 -0.853403 -7.0 76.923077 0.000000
438 13.456476 2.440403 -4.0 53.846154 66.666667
... ... ... ... ... ...
474 -6.548623 0.610484 -4.0 53.333333 -100.000000
475 -11.068650 -0.136905 -22.0 46.153846 -33.333333
478 7.048118 -1.246590 22.0 56.250000 -16.666667
479 -0.596885 0.020515 -11.0 50.000000 0.000000
480 17.723755 0.354974 3.0 53.846154 66.666667
482 -5.061380 -1.212153 7.0 53.846154 33.333333
483 -5.004302 0.906597 -4.0 41.666667 33.333333
485 2.012543 0.366985 21.0 57.142857 66.666667
487 2.910151 0.425559 7.0 61.538462 -33.333333
488 14.275080 0.459693 15.0 61.538462 -33.333333
489 -13.536521 0.097342 5.0 60.000000 -33.333333
490 4.510188 -0.570758 7.0 57.142857 0.000000
492 -13.615487 -0.737024 -1.0 46.666667 66.666667
493 10.775508 -0.057722 7.0 60.000000 0.000000
494 4.569589 -0.231439 25.0 58.823529 0.000000
496 -18.726971 -0.755737 -14.0 53.333333 -33.333333
497 -5.574733 0.886535 5.0 35.714286 66.666667
499 5.014283 1.972348 17.0 52.631579 0.000000
500 6.415078 0.254813 -18.0 40.000000 -33.333333
502 -1.343308 2.062729 -3.0 43.750000 -33.333333
503 -5.738591 -0.046456 16.0 62.500000 33.333333
505 6.941454 1.678318 28.0 73.333333 33.333333
506 5.622383 -1.324729 -16.0 60.000000 -66.666667
507 -0.677689 -0.313345 -11.0 66.666667 33.333333
509 -0.716536 1.824407 -33.0 42.857143 0.000000
510 -3.303823 -0.271935 -16.0 50.000000 0.000000
513 6.315981 -0.617777 -24.0 50.000000 0.000000
514 -2.200375 0.969143 5.0 50.000000 0.000000
515 -0.521025 1.039181 -23.0 38.888889 33.333333
516 -1.575550 -1.707931 -24.0 52.380952 0.000000

87 rows × 5 columns


In [21]:
def getPrediction(match_id):
    '''Returns the prediction for the given match

    Looks the match up in the module-level `matches` table, takes its feature
    row from `X_timetest` and asks the fitted `knn1` classifier for the winner.

    Args: match_id (int): Match ID for the required game (supported range 399-517)

    Returns: dict or None: {'name': predicted winning team (str),
             'prob': probability of that team winning, in percent (float)}.
             None is returned when match_id is outside the supported range,
             is not present in `matches`, or has no feature row in `X_timetest`.
    '''
    # Explicit range guard instead of `assert`: assertions are removed under
    # `python -O`, and wrapping the whole body in `except AssertionError`
    # would also hide unrelated assertion failures raised further down.
    if not (399 <= match_id <= 517):
        return None

    match_row = matches.loc[matches['id'] == match_id]
    # Feature rows are labelled with match_id - 1.
    toPredict = X_timetest.loc[X_timetest.index == match_id - 1].values

    # Some ids inside the range have no feature row (the index has gaps) or no
    # entry in `matches`; there is nothing to predict for those, so report None
    # rather than letting the classifier fail on an empty input.
    if len(match_row) == 0 or len(toPredict) == 0:
        return None

    team1name = match_row['team1'].iloc[0]
    team2name = match_row['team2'].iloc[0]

    prediction_prob = knn1.predict_proba(toPredict)
    prediction = knn1.predict(toPredict)

    results = {}
    # A positive label means team1 is predicted to win. The probability columns
    # are read as [team2 wins, team1 wins] -- assumes knn1.classes_ is ordered
    # (0, 1); TODO confirm.
    if prediction[0] > 0:
        results['name'] = str(team1name)
        results['prob'] = float(prediction_prob[0][1])*100
    else:
        results['name'] = str(team2name)
        results['prob'] = float(prediction_prob[0][0])*100
    return results

In [23]:
# 617 is outside the 399-517 range accepted by getPrediction, so this prints None.
# Parenthesised single-argument form prints the same on Python 2 and is valid on Python 3.
print(getPrediction(617))


None