In [ ]:
In [ ]:
In [ ]:
# Per-user activity window: latest BetDate and earliest FirstDepositDate.
df_max_BetDate = (
    df_bc.groupby('UserId')['BetDate'].max()
    .rename('max_BetDate')
    .reset_index()
)
df_min_FirstDepositDate = (
    df_bc.groupby('UserId')['FirstDepositDate'].min()
    .rename('min_FirstDepositDate')
    .reset_index()
)

# Attach both per-user dates to every row of df_bc.
df_ltv = df_bc.merge(df_max_BetDate, how='left', on='UserId')
df_ltv = df_ltv.merge(df_min_FirstDepositDate, how='left', on='UserId')

# lifetime = latest bet date minus earliest first-deposit date (a timedelta).
df_ltv['lifetime'] = df_ltv['max_BetDate'] - df_ltv['min_FirstDepositDate']
df_ltv.head()
In [ ]:
# Total BetAmount per user, attached to every row of df_ltv as `Value`.
df_Value = (
    df_bc.groupby('UserId')['BetAmount'].sum()
    .rename('Value')
    .reset_index()
)
df_ltv = df_ltv.merge(df_Value, how='left', on='UserId')
df_ltv.head()
In [ ]:
# Value per day of lifetime (value / lifetime).
# `.dt.days` gives the whole-day count of each timedelta. The previous
# `astype('<m8[D]')` is rejected by pandas 2.x, which only supports the
# s/ms/us/ns resolutions for timedelta columns.
df_ltv['lifetime_int'] = df_ltv['lifetime'].dt.days
# Treat a zero-day lifetime as one day so the division below is defined.
df_ltv['lifetime_int'] = df_ltv['lifetime_int'].mask(df_ltv['lifetime_int'] == 0, 1)
df_ltv['Value_lifetime'] = df_ltv['Value'] / df_ltv['lifetime_int']
In [ ]:
# Age in whole years at the moment this cell runs.
# pd.Timestamp.now() avoids relying on `datetime` having been imported.
now = pd.Timestamp.now()
# BirthDate is read from df_ltv itself: df_ltv was built by left-merging onto
# df_bc, so it carries the column, and the result no longer depends on df_bc
# and df_ltv sharing the same index.
# Days are converted to years explicitly (365.2425 days per year) because
# `astype('<m8[Y]')` is not a supported timedelta resolution in pandas 2.x.
df_ltv['age'] = (now - df_ltv['BirthDate']).dt.days // 365.2425
In [ ]:
# Recompute lifetime_int as the whole-day count of `lifetime`.
# NOTE(review): this overwrites any existing lifetime_int column, so zero-day
# lifetimes are 0 again here; Value_lifetime is not recomputed by this line.
df_ltv['lifetime_int'] = df_ltv['lifetime'].dt.days
In [ ]:
from datetime import timedelta
In [ ]:
# Keep only rows whose BetDate is no later than FirstDepositDate + 15 days.
first_15_days_end = df_ltv['FirstDepositDate'] + timedelta(days=15)
df_ltv_15 = df_ltv[df_ltv['BetDate'] <= first_15_days_end]
In [ ]:
# Per-user BetAmount statistics over the rows in df_ltv_15.
bet_amounts = df_ltv_15.groupby('UserId')['BetAmount']
mean_bet_by_user = bet_amounts.mean().rename('mean_bet').reset_index()
min_bet_by_user = bet_amounts.min().rename('min_bet').reset_index()
max_bet_by_user = bet_amounts.max().rename('max_bet').reset_index()
median_bet_by_user = bet_amounts.median().rename('median_bet').reset_index()
sum_bet_by_user = bet_amounts.sum().rename('sum_bet').reset_index()

# Number of non-null BetId values per user.
count_bet_by_user = (
    df_ltv_15.groupby('UserId')['BetId'].count()
    .rename('count_bet')
    .reset_index()
)
In [ ]:
# Per-user AmountWon statistics over the rows in df_ltv_15.
amounts_won = df_ltv_15.groupby('UserId')['AmountWon']
mean_bet_won_by_user = amounts_won.mean().rename('mean_bet_won').reset_index()
min_bet_won_by_user = amounts_won.min().rename('min_bet_won').reset_index()
max_bet_won_by_user = amounts_won.max().rename('max_bet_won').reset_index()
median_bet_won_by_user = amounts_won.median().rename('median_bet_won').reset_index()
sum_bet_won_by_user = amounts_won.sum().rename('sum_bet_won').reset_index()

# Number of bets per user with a strictly positive AmountWon.
winning_bets = df_ltv_15[df_ltv_15['AmountWon'] > 0]
count_bet_won_by_user = (
    winning_bets.groupby('UserId')['BetId'].count()
    .rename('count_bet_won')
    .reset_index()
)
In [ ]:
# Dummy-encode Application and sum the indicator columns per user.
# NOTE(review): presumably UserId is numeric, so get_dummies leaves it intact
# as the grouping key - confirm.
application_dummies = pd.get_dummies(df_ltv_15[['UserId', 'Application']])
application_used = application_dummies.groupby('UserId').sum()
In [ ]:
# Mean Odds per user (a Series indexed by UserId).
odds_by_user = df_ltv_15.groupby('UserId')['Odds']
mean_ods_by_user = odds_by_user.mean()
In [ ]:
# Largest IsLive value per user (a Series indexed by UserId).
islive_grouped = df_ltv_15.groupby('UserId')['IsLive']
islive_by_user = islive_grouped.max()
In [ ]:
# Dummy-encode PartnerType and sum the indicator columns per user.
# NOTE(review): presumably UserId is numeric, so get_dummies leaves it intact
# as the grouping key - confirm.
partnertype_dummies = pd.get_dummies(df_ltv_15[['UserId', 'PartnerType']])
partnertype_used = partnertype_dummies.groupby('UserId').sum()
In [ ]:
# Per-user Gender indicators: count each dummy column per user, then divide
# the counts by themselves so any non-zero count becomes 1.0; the NaN produced
# by 0 / 0 is replaced with 0.
gender_counts = pd.get_dummies(df_ltv_15[['UserId', 'Gender']]).groupby('UserId').sum()
gender_used = gender_counts.div(gender_counts).fillna(0)
In [ ]:
# Per-user Country indicators: count each dummy column per user, then divide
# the counts by themselves so any non-zero count becomes 1.0; the NaN produced
# by 0 / 0 is replaced with 0.
country_counts = pd.get_dummies(df_ltv_15[['UserId', 'Country']]).groupby('UserId').sum()
Country_used = country_counts.div(country_counts).fillna(0)
In [ ]:
# Assemble the per-user feature table: start from mean_bet_by_user and
# left-join every other per-user feature set on UserId, in this order.
per_user_features = [
    min_bet_by_user,
    max_bet_by_user,
    median_bet_by_user,
    sum_bet_by_user,
    count_bet_by_user,
    mean_bet_won_by_user,
    min_bet_won_by_user,
    max_bet_won_by_user,
    median_bet_won_by_user,
    sum_bet_won_by_user,
    count_bet_won_by_user,
    application_used,
    mean_ods_by_user,
    islive_by_user,
    partnertype_used,
    gender_used,
    Country_used,
]

df_res = mean_bet_by_user
for feature_set in per_user_features:
    df_res = pd.merge(df_res,
                      feature_set,
                      how='left',
                      left_on=['UserId'],
                      right_on=['UserId'])
In [ ]:
# Share of a user's counted bets that had a positive AmountWon.
df_res['ratio_win'] = df_res['count_bet_won'].div(df_res['count_bet'])
In [ ]:
# Replace every missing value left by the left joins (and by ratio_win) with 0.
df_res = df_res.fillna(value=0)
In [ ]:
# Distinct (UserId, Value_lifetime) pairs, used as the regression target table.
target_columns = ['UserId', 'Value_lifetime']
ltv_to_evaluation = df_ltv[target_columns].drop_duplicates()
In [ ]:
# Attach the Value_lifetime target to the per-user feature table.
df_final = df_res.merge(ltv_to_evaluation, how='left', on='UserId')
In [ ]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# Min-max scale every column of df_final except the target.
# The feature list is taken from df_final.columns in order (previously a
# set), so the column order of dfTest is the same on every run and fitted
# model coefficients can be matched back to feature_columns.
feature_columns = [col for col in df_final.columns if col != 'Value_lifetime']
scaler = MinMaxScaler()
# NOTE(review): the scaler is fit on all rows (before any train/test split)
# and UserId is among the scaled feature columns - confirm both are intended.
dfTest = scaler.fit_transform(df_final[feature_columns])
dfTest
In [ ]:
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Linear regression of Value_lifetime on the scaled feature matrix.
X = dfTest
y = df_final['Value_lifetime']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42)
reg = LinearRegression().fit(X_train, y_train)
# Report the training-split R^2 explicitly. The previous `reg.score(X, y)`
# mixed training and test rows and its result was discarded.
print('Train R^2: %.2f' % reg.score(X_train, y_train))
y_pred = reg.predict(X_test)
# The coefficients
print('Coefficients: \n', reg.coef_)
# The mean squared error on the held-out split
print("Mean squared error: %.2f"
      % mean_squared_error(y_test, y_pred))
# R^2 on the held-out split: 1 is perfect prediction
print('Variance score: %.2f' % r2_score(y_test, y_pred))
In [ ]:
from sklearn import svm

# Support vector regression of Value_lifetime on the scaled feature matrix,
# using the same split parameters as the linear model.
X = dfTest
y = df_final['Value_lifetime']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42)
clf = svm.SVR()
clf.fit(X_train, y_train)
# Report the training-split R^2; the previous bare `clf.score(...)` call
# discarded its result.
print('Train R^2: %.2f' % clf.score(X_train, y_train))
y_pred = clf.predict(X_test)
# No coefficient printout: SVR exposes coef_ only for a linear kernel.
# The mean squared error on the held-out split
print("Mean squared error: %.2f"
      % mean_squared_error(y_test, y_pred))
# R^2 on the held-out split: 1 is perfect prediction
print('Variance score: %.2f' % r2_score(y_test, y_pred))
In [ ]:
# Re-print R^2 for the currently bound y_test / y_pred.
latest_r2 = r2_score(y_test, y_pred)
print('Variance score: %.2f' % latest_r2)
In [ ]: