In [1]:
import pandas
In [2]:
# Load the raw per-user daily aggregation table; the path is relative to
# the notebook's working directory.
raw_data = pandas.read_csv('sports_betting/RawDataIIUserDailyAggregation.csv')
In [3]:
# Preview the first rows of the raw data.
raw_data.head()
Out[3]:
In [4]:
import datetime
In [5]:
# Parse the YYYYMMDD integer dates in one vectorized call instead of a
# per-row strptime. The column becomes datetime64 (Timestamp values);
# Timestamp subclasses datetime.datetime, so the later comparisons and
# subtractions against max_date behave the same.
raw_data['Date'] = pandas.to_datetime(raw_data['Date'].astype(str), format='%Y%m%d')
In [6]:
# Confirm the Date column now holds parsed datetimes.
raw_data.head()
Out[6]:
In [7]:
# Net amount per daily row: column 4 minus column 3 (presumably winnings
# minus stakes — TODO confirm against the CSV header). Vectorized column
# arithmetic replaces the row-wise apply, which relied on deprecated
# positional indexing (x[4] on a labelled row Series); astype(int)
# truncates toward zero exactly like the original int() casts.
raw_data['Amount'] = raw_data.iloc[:, 4].astype(int) - raw_data.iloc[:, 3].astype(int)
In [8]:
# Check the new Amount column.
raw_data.head()
Out[8]:
In [9]:
# Group the daily rows by user so per-user histories can be assembled.
raw_user_data = raw_data.groupby('UserID')
In [10]:
# Collapse each user's rows so every column holds that user's values
# as a list (one row per user after reset_index).
user_data = raw_user_data.aggregate(list).reset_index()
In [11]:
# Preview the per-user aggregated frame.
user_data.head()
Out[11]:
In [12]:
# Observation cutoff: users with no activity after this date are labelled
# churned. NOTE(review): 2005-05-24 looks dataset-specific — confirm it
# matches the data collection window.
max_date = datetime.datetime.strptime('2005-05-24', '%Y-%m-%d')
# churn is True when the user's most recent activity is on or before the cutoff.
user_data['churn'] = user_data['Date'].apply(lambda x: max(x) <= max_date)
In [13]:
# Inspect the new churn flag.
user_data.head()
Out[13]:
In [14]:
def make_daily_list(row, cutoff=None):
    """Build the [date, amount] activity pairs for one user's row,
    keeping only activity on or before the cutoff date.

    Parameters
    ----------
    row : pandas.Series
        One row of ``user_data``; position 1 holds the list of activity
        dates and position 6 the matching list of amounts (positions as
        used by the original notebook — confirm if columns change).
    cutoff : datetime.datetime, optional
        Latest date to keep. Defaults to the module-level ``max_date``,
        preserving the original behaviour for ``df.apply(make_daily_list)``.

    Returns
    -------
    list of [date, amount] pairs.
    """
    if cutoff is None:
        cutoff = max_date
    # .iloc makes the positional access explicit: plain row[1] on a
    # string-labelled Series is the deprecated positional fallback and
    # is removed in modern pandas.
    date_list = row.iloc[1]
    amount_list = row.iloc[6]
    items = []
    for i, each_date in enumerate(date_list):
        if each_date > cutoff:
            continue
        items.append([
            each_date, amount_list[i]
        ])
    return items
In [15]:
# Per-user list of [date, amount] pairs restricted to the observation
# window. NOTE(review): 'activites' is a typo for 'activities', but all
# later cells use this spelling — rename everywhere at once or not at all.
user_data['activites'] = user_data.apply(make_daily_list, axis=1)
In [16]:
# Keep only the modelling columns. .copy() makes churn_data an
# independent frame: later cells assign new columns to it, which on the
# original slice triggered SettingWithCopyWarning and risked the writes
# silently not sticking.
churn_data = user_data[['UserID', 'churn', 'activites']].copy()
In [17]:
# Preview the modelling frame.
churn_data.head()
Out[17]:
In [18]:
# Feature: total number of active days within the observation window.
churn_data['Total'] = churn_data['activites'].apply(len)
In [19]:
# Pull just the dates out of the [date, amount] activity pairs.
churn_data['all_dates'] = churn_data['activites'].apply(lambda daily: [day for day, _amount in daily])
In [20]:
# Feature: days between the cutoff date and the user's most recent
# activity (0 for users with no activity inside the window).
churn_data['last_active'] = churn_data['all_dates'].apply(lambda x: (max_date - max(x)).days if x else 0)
In [21]:
# Inspect the recency features.
churn_data.head()
Out[21]:
In [22]:
# Win/loss counts and net amount per user. Note: 'amount_won' is the NET
# total over all days (wins minus losses), matching the original column.
churn_data['total_wins'] = churn_data['activites'].apply(lambda acts: sum(1 for entry in acts if entry[1] > 0))
churn_data['total_loss'] = churn_data['activites'].apply(lambda acts: sum(1 for entry in acts if entry[1] < 0))
churn_data['amount_won'] = churn_data['activites'].apply(lambda acts: sum(entry[1] for entry in acts))
In [23]:
# Feature: span in days between the user's first and last activity
# (0 when the user has no retained activity).
churn_data['active_days'] = churn_data['all_dates'].apply(lambda x: (max(x) - min(x)).days if x else 0)
In [24]:
# Final feature table before modelling.
churn_data.head()
Out[24]:
In [25]:
# sklearn.cross_validation was removed in scikit-learn 0.20;
# train_test_split now lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split
# Hold out 20% of users for evaluation. random_state pins the split so
# the reported metrics are reproducible across re-runs.
train_data, test_data = train_test_split(churn_data, train_size=0.8, random_state=42)
In [26]:
from sklearn import tree
In [27]:
# Feature columns shared by every classifier below.
features = ['Total', 'last_active', 'total_wins', 'total_loss', 'amount_won', 'active_days']
In [28]:
# Training labels: the churn flag for the training split.
output = train_data['churn']
In [29]:
# Decision tree with default hyper-parameters.
# NOTE(review): no max_depth or random_state — the fitted tree can vary
# between runs and may overfit; consider pinning random_state.
clf = tree.DecisionTreeClassifier()
In [30]:
# Fit the tree on the engineered features.
clf.fit(train_data[features], output)
Out[30]:
In [31]:
# Predict churn for the held-out users.
predicted = clf.predict(test_data[features])
y_true = test_data[['churn']]
# metrics is re-imported in several evaluation cells below; harmless,
# but it could live once in the top import cell.
from sklearn import metrics
In [32]:
# Accuracy and confusion matrix for the decision tree.
print(metrics.accuracy_score(y_true=y_true, y_pred=predicted))
print(metrics.confusion_matrix(y_true=y_true, y_pred=predicted))
In [33]:
# The original line was missing the call parentheses, so the cell
# displayed the bound method object instead of the hyper-parameters.
clf.get_params()
Out[33]:
In [34]:
from sklearn.linear_model import LogisticRegression
# max_iter raised above the default: the unscaled count/amount features
# can keep the solver from converging within the default iteration cap,
# which would otherwise emit a ConvergenceWarning and stop early.
est = LogisticRegression(max_iter=1000)
est.fit(train_data[features], output)
Out[34]:
In [35]:
# Evaluate logistic regression on the same held-out split.
predicted_logistic = est.predict(test_data[features])
y_true_logistic = test_data[['churn']]
from sklearn import metrics
print("accuracy", metrics.accuracy_score(y_true=y_true_logistic, y_pred=predicted_logistic))
print(metrics.confusion_matrix(y_true=y_true_logistic, y_pred=predicted_logistic))
In [36]:
from sklearn import svm
# SVC with default hyper-parameters.
# NOTE(review): the features are unscaled and SVMs are scale-sensitive;
# a StandardScaler step would likely change (and improve) this result.
svm_model = svm.SVC()
svm_model.fit(train_data[features], output)
Out[36]:
In [37]:
# Evaluate the SVM on the held-out split.
predicted_svm = svm_model.predict(test_data[features])
y_true_svm = test_data[['churn']]
from sklearn import metrics
print("accuracy", metrics.accuracy_score(y_true=y_true_svm, y_pred=predicted_svm))
print(metrics.confusion_matrix(y_true=y_true_svm, y_pred=predicted_svm))
In [60]:
from sklearn.neighbors import KNeighborsClassifier
# k = 40 neighbours. NOTE(review): 40 looks hand-tuned (the jump in
# execution counts suggests experimentation); a grid search over k with
# cross-validation would make the choice defensible.
knn_model = KNeighborsClassifier(n_neighbors=40)
knn_model.fit(train_data[features], output)
Out[60]:
In [61]:
# Evaluate k-nearest-neighbours on the held-out split.
predicted_knn = knn_model.predict(test_data[features])
y_true_knn = test_data[['churn']]
from sklearn import metrics
print("accuracy", metrics.accuracy_score(y_true=y_true_knn, y_pred=predicted_knn))
print(metrics.confusion_matrix(y_true=y_true_knn, y_pred=predicted_knn))
In [70]:
# Persist the engineered features plus label for reuse outside the
# notebook. NOTE(review): the DataFrame index is written as an unnamed
# first column; pass index=False if that is unwanted.
churn_data_csv = churn_data[features + ['churn', 'UserID']]
churn_data_csv.to_csv('churn_data.csv', sep=',')
In [4]:
# DataFrame.sort_index(by=...) was deprecated and later removed; sorting
# by a column is sort_values. Use the boolean True rather than the int 1.
new_df = raw_data.sort_values(by='UserID', ascending=True)
In [5]:
# drop_duplicates' `cols` keyword was renamed to `subset` (and later
# removed), and the column is 'UserID' — 'UserI' would raise a KeyError.
# Note the result is only displayed, not assigned back.
new_df.drop_duplicates(subset='UserID')
Out[5]:
In [ ]: