In [1]:
import pandas

In [2]:
raw_data = pandas.read_csv('sports_betting/RawDataIIUserDailyAggregation.csv')

In [3]:
raw_data.head()


Out[3]:
UserID Date ProductID Stakes Winnings Bets
0 1324354 20050224 1 20 0 2
1 1324354 20050225 1 0 0 0
2 1324354 20050227 1 20 0 2
3 1324354 20050303 1 10 0 1
4 1324354 20050304 1 10 0 1

In [4]:
import datetime

Convert date to Python format


In [5]:
raw_data['Date'] = raw_data['Date'].apply(lambda x: datetime.datetime.strptime(str(x), '%Y%m%d'))

In [6]:
raw_data.head()


Out[6]:
UserID Date ProductID Stakes Winnings Bets
0 1324354 2005-02-24 1 20 0 2
1 1324354 2005-02-25 1 0 0 0
2 1324354 2005-02-27 1 20 0 2
3 1324354 2005-03-03 1 10 0 1
4 1324354 2005-03-04 1 10 0 1

Determine the total amount based on winnings and Stakes


In [7]:
raw_data['Amount'] = raw_data.apply(lambda x: int(x[4]) - int(x[3]), axis=1)

In [8]:
raw_data.head()


Out[8]:
UserID Date ProductID Stakes Winnings Bets Amount
0 1324354 2005-02-24 1 20 0 2 -20
1 1324354 2005-02-25 1 0 0 0 0
2 1324354 2005-02-27 1 20 0 2 -20
3 1324354 2005-03-03 1 10 0 1 -10
4 1324354 2005-03-04 1 10 0 1 -10

Lets group the data by users


In [9]:
raw_user_data = raw_data.groupby('UserID')

In [10]:
user_data = raw_user_data.aggregate(lambda x: list(x)).reset_index()

In [11]:
user_data.head()


Out[11]:
UserID Date ProductID Stakes Winnings Bets Amount
0 1324354 [2005-02-24 00:00:00, 2005-02-25 00:00:00, 200... [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... [20.0, 0.0, 20.0, 10.0, 10.0, 10.0, 10.0, 20.0... [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ... [2, 0, 2, 1, 1, 1, 1, 3, 1, 1, 1, 1, 0, 1, 0, ... [-20, 0, -20, -10, -10, -10, -10, -20, -10, -1...
1 1324355 [2005-02-01 00:00:00, 2005-02-01 00:00:00, 200... [2, 1, 1, 2, 2, 2, 2, 1, 1, 2, 2, 1, 1, 1, 1, ... [2.0, 3.5, 3.0, 6.7, 6.0, 6.0, 3.0, 6.0, 2.0, ... [0.0, 0.0, 0.0, 11.2, 0.0, 0.0, 0.0, 0.0, 0.0,... [2, 6, 6, 7, 5, 1, 4, 6, 2, 1, 1, 1, 1, 0, 0, ... [-2, -3, -3, 5, -6, -6, -3, -6, -2, 0, 0, -1, ...
2 1324356 [2005-02-02 00:00:00, 2005-02-02 00:00:00, 200... [1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, ... [4.0, 1.0, 1.0, 0.0, 19.0, 9.99, 1.0, 5.89, 15... [0.0, 0.0, 0.0, 0.0, 0.0, 14.99, 5.89, 0.0, 0.... [4, 2, 2, 0, 2, 2, 1, 2, 3, 2, 3, 3, 2, 0, 2, ... [-4, -1, -1, 0, -19, 5, 4, -5, -15, -1, 1, 0, ...
3 1324358 [2005-02-01 00:00:00, 2005-02-02 00:00:00, 200... [1, 1, 1, 1, 1, 1, 2, 1, 1] [73.451, 0.0, 47.2813, 30.4368, 0.0, 32.6108, ... [42.6632, 47.3561, 0.0, 0.0, 0.0, 0.0, 55.9819... [3, 0, 1, 1, 0, 1, 4, 0, 1] [-31, 47, -47, -30, 0, -32, -33, 63, -63]
4 1324360 [2005-02-02 00:00:00, 2005-02-03 00:00:00, 200... [1, 1, 8, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, ... [1.7321, 2.8868, 4.0, 0.8661, 0.0, 0.0, 0.5774... [0.0, 0.0, 2.0, 0.0, 5.6293, 0.0, 0.0, 1.2009,... [1, 1, 4, 2, 0, 0, 1, 2, 5, 0, 4, 0, 0, 3, 0, ... [-1, -2, -2, 0, 5, 0, 0, 0, -3, 3, -2, 0, 0, -...

Determine if the user churns after 3 months.


In [12]:
max_date = datetime.datetime.strptime('2005-05-24', '%Y-%m-%d')
user_data['churn'] = user_data['Date'].apply(lambda x: max(x) <= max_date)

In [13]:
user_data.head()


Out[13]:
UserID Date ProductID Stakes Winnings Bets Amount churn
0 1324354 [2005-02-24 00:00:00, 2005-02-25 00:00:00, 200... [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... [20.0, 0.0, 20.0, 10.0, 10.0, 10.0, 10.0, 20.0... [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ... [2, 0, 2, 1, 1, 1, 1, 3, 1, 1, 1, 1, 0, 1, 0, ... [-20, 0, -20, -10, -10, -10, -10, -20, -10, -1... False
1 1324355 [2005-02-01 00:00:00, 2005-02-01 00:00:00, 200... [2, 1, 1, 2, 2, 2, 2, 1, 1, 2, 2, 1, 1, 1, 1, ... [2.0, 3.5, 3.0, 6.7, 6.0, 6.0, 3.0, 6.0, 2.0, ... [0.0, 0.0, 0.0, 11.2, 0.0, 0.0, 0.0, 0.0, 0.0,... [2, 6, 6, 7, 5, 1, 4, 6, 2, 1, 1, 1, 1, 0, 0, ... [-2, -3, -3, 5, -6, -6, -3, -6, -2, 0, 0, -1, ... False
2 1324356 [2005-02-02 00:00:00, 2005-02-02 00:00:00, 200... [1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, ... [4.0, 1.0, 1.0, 0.0, 19.0, 9.99, 1.0, 5.89, 15... [0.0, 0.0, 0.0, 0.0, 0.0, 14.99, 5.89, 0.0, 0.... [4, 2, 2, 0, 2, 2, 1, 2, 3, 2, 3, 3, 2, 0, 2, ... [-4, -1, -1, 0, -19, 5, 4, -5, -15, -1, 1, 0, ... False
3 1324358 [2005-02-01 00:00:00, 2005-02-02 00:00:00, 200... [1, 1, 1, 1, 1, 1, 2, 1, 1] [73.451, 0.0, 47.2813, 30.4368, 0.0, 32.6108, ... [42.6632, 47.3561, 0.0, 0.0, 0.0, 0.0, 55.9819... [3, 0, 1, 1, 0, 1, 4, 0, 1] [-31, 47, -47, -30, 0, -32, -33, 63, -63] True
4 1324360 [2005-02-02 00:00:00, 2005-02-03 00:00:00, 200... [1, 1, 8, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, ... [1.7321, 2.8868, 4.0, 0.8661, 0.0, 0.0, 0.5774... [0.0, 0.0, 2.0, 0.0, 5.6293, 0.0, 0.0, 1.2009,... [1, 1, 4, 2, 0, 0, 1, 2, 5, 0, 4, 0, 0, 3, 0, ... [-1, -2, -2, 0, 5, 0, 0, 0, -3, 3, -2, 0, 0, -... False

In [14]:
def make_daily_list(row):
    date_list = row[1]
    amount_list = row[6]
    items = []
    for i, each_date in enumerate(date_list):
        if each_date > max_date:
            continue
        items.append([
                each_date, amount_list[i]
            ])
    return items

In [15]:
user_data['activites'] = user_data.apply(make_daily_list, axis=1)

In [16]:
churn_data = user_data[['UserID', 'churn', 'activites']]

In [17]:
churn_data.head()


Out[17]:
UserID churn activites
0 1324354 False [[2005-02-24 00:00:00, -20], [2005-02-25 00:00...
1 1324355 False [[2005-02-01 00:00:00, -2], [2005-02-01 00:00:...
2 1324356 False [[2005-02-02 00:00:00, -4], [2005-02-02 00:00:...
3 1324358 True [[2005-02-01 00:00:00, -31], [2005-02-02 00:00...
4 1324360 False [[2005-02-02 00:00:00, -1], [2005-02-03 00:00:...

In [18]:
churn_data['Total'] = churn_data['activites'].apply(lambda x: len(x))


/home/sudip/phunka/projects/freelance/forex/envforex/lib/python3.4/site-packages/ipykernel/__main__.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':

In [19]:
churn_data['all_dates'] = churn_data['activites'].apply(lambda x: [each_date[0] for each_date in x])


/home/sudip/phunka/projects/freelance/forex/envforex/lib/python3.4/site-packages/ipykernel/__main__.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':

In [20]:
churn_data['last_active'] = churn_data['all_dates'].apply(lambda x: (max_date - max(x)).days if x else 0)


/home/sudip/phunka/projects/freelance/forex/envforex/lib/python3.4/site-packages/ipykernel/__main__.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':

In [21]:
churn_data.head()


Out[21]:
UserID churn activites Total all_dates last_active
0 1324354 False [[2005-02-24 00:00:00, -20], [2005-02-25 00:00... 83 [2005-02-24 00:00:00, 2005-02-25 00:00:00, 200... 2
1 1324355 False [[2005-02-01 00:00:00, -2], [2005-02-01 00:00:... 69 [2005-02-01 00:00:00, 2005-02-01 00:00:00, 200... 1
2 1324356 False [[2005-02-02 00:00:00, -4], [2005-02-02 00:00:... 46 [2005-02-02 00:00:00, 2005-02-02 00:00:00, 200... 52
3 1324358 True [[2005-02-01 00:00:00, -31], [2005-02-02 00:00... 9 [2005-02-01 00:00:00, 2005-02-02 00:00:00, 200... 18
4 1324360 False [[2005-02-02 00:00:00, -1], [2005-02-03 00:00:... 21 [2005-02-02 00:00:00, 2005-02-03 00:00:00, 200... 52

In [22]:
churn_data['total_wins'] = churn_data['activites'].apply(lambda x: len([i[1] for i in x if i[1] >0 ]) )
churn_data['total_loss'] = churn_data['activites'].apply(lambda x: len([i[1] for i in x if i[1] <0 ]) )

churn_data['amount_won'] = churn_data['activites'].apply(lambda x: sum([i[1] for i in x]) )


/home/sudip/phunka/projects/freelance/forex/envforex/lib/python3.4/site-packages/ipykernel/__main__.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
/home/sudip/phunka/projects/freelance/forex/envforex/lib/python3.4/site-packages/ipykernel/__main__.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app

In [23]:
churn_data['active_days'] = churn_data['all_dates'].apply(lambda x: (max(x) - min(x)).days if x else 0)

In [24]:
churn_data.head()


Out[24]:
UserID churn activites Total all_dates last_active total_wins total_loss amount_won active_days
0 1324354 False [[2005-02-24 00:00:00, -20], [2005-02-25 00:00... 83 [2005-02-24 00:00:00, 2005-02-25 00:00:00, 200... 2 29 47 -211 87
1 1324355 False [[2005-02-01 00:00:00, -2], [2005-02-01 00:00:... 69 [2005-02-01 00:00:00, 2005-02-01 00:00:00, 200... 1 13 43 118 111
2 1324356 False [[2005-02-02 00:00:00, -4], [2005-02-02 00:00:... 46 [2005-02-02 00:00:00, 2005-02-02 00:00:00, 200... 52 7 26 -137 59
3 1324358 True [[2005-02-01 00:00:00, -31], [2005-02-02 00:00... 9 [2005-02-01 00:00:00, 2005-02-02 00:00:00, 200... 18 2 6 -126 94
4 1324360 False [[2005-02-02 00:00:00, -1], [2005-02-03 00:00:... 21 [2005-02-02 00:00:00, 2005-02-03 00:00:00, 200... 52 4 9 -4 59

In [25]:
from sklearn.cross_validation import train_test_split
train_data, test_data = train_test_split(churn_data, train_size=0.8)

In [26]:
from sklearn import tree

In [27]:
features = ['Total',  'last_active', 'total_wins', 'total_loss', 'amount_won', 'active_days']

Try out with Decision Tree


In [28]:
output = train_data['churn']

In [29]:
clf = tree.DecisionTreeClassifier()

In [30]:
clf.fit(train_data[features], output)


Out[30]:
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

In [31]:
predicted = clf.predict(test_data[features])

y_true = test_data[['churn']]
from sklearn import metrics

In [32]:
print(metrics.accuracy_score(y_true=y_true, y_pred=predicted))
print(metrics.confusion_matrix(y_true=y_true, y_pred=predicted))


0.686010717136
[[3820 1387]
 [1367 2197]]

In [33]:
clf.get_params


Out[33]:
<bound method DecisionTreeClassifier.get_params of DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')>

Lets try out with Logistic Regression


In [34]:
from sklearn.linear_model import LogisticRegression
est = LogisticRegression()
est.fit(train_data[features], output)


Out[34]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [35]:
predicted_logistic = est.predict(test_data[features])

y_true_logistic = test_data[['churn']]
from sklearn import metrics
print("accuracy", metrics.accuracy_score(y_true=y_true_logistic, y_pred=predicted_logistic))
print(metrics.confusion_matrix(y_true=y_true_logistic, y_pred=predicted_logistic))


accuracy 0.751339642002
[[4266  941]
 [1240 2324]]

Lets try with SVM


In [36]:
from sklearn import svm
svm_model = svm.SVC()
svm_model.fit(train_data[features], output)


Out[36]:
SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [37]:
predicted_svm = svm_model.predict(test_data[features])

y_true_svm = test_data[['churn']]
from sklearn import metrics
print("accuracy", metrics.accuracy_score(y_true=y_true_svm, y_pred=predicted_svm))
print(metrics.confusion_matrix(y_true=y_true_svm, y_pred=predicted_svm))


accuracy 0.690001140121
[[4881  326]
 [2393 1171]]

Lets try out with Nearest Neighbour


In [60]:
from sklearn.neighbors import KNeighborsClassifier
knn_model = KNeighborsClassifier(n_neighbors=40)
knn_model.fit(train_data[features], output)


Out[60]:
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=40, p=2,
           weights='uniform')

In [61]:
predicted_knn = knn_model.predict(test_data[features])

y_true_knn = test_data[['churn']]
from sklearn import metrics
print("accuracy", metrics.accuracy_score(y_true=y_true_knn, y_pred=predicted_knn))
print(metrics.confusion_matrix(y_true=y_true_knn, y_pred=predicted_knn))


accuracy 0.758294379204
[[4152 1055]
 [1065 2499]]

In [70]:
churn_data_csv = churn_data[features + ['churn', 'UserID']]
churn_data_csv.to_csv('churn_data.csv', sep=',')

In [4]:
new_df = raw_data.sort_index(by='UserID', ascending=1)


/home/sudip/phunka/projects/freelance/forex/envforex/lib/python3.4/site-packages/ipykernel/__main__.py:1: FutureWarning: by argument to sort_index is deprecated, pls use .sort_values(by=...)
  if __name__ == '__main__':

In [5]:
new_df.drop_duplicates(cols='UserI')


Out[5]:
UserID Date ProductID Stakes Winnings Bets
0 1324354 20050224 1 20.00 0.00 2
100 1324354 20050817 1 42.48 0.00 2
99 1324354 20050816 1 50.00 92.48 2
98 1324354 20050815 1 176.90 0.00 1
97 1324354 20050814 2 145.00 176.90 1

In [ ]: