In [ ]:
import pandas as pd
import pandas_profiling
import matplotlib
from datetime import datetime
import numpy as np
import matplotlib.pyplot as plt
matplotlib.style.use('ggplot')
%matplotlib inline
In [213]:
# Load the raw dataset; the path is relative to the notebook's working directory.
DATASET_PATH = 'data/dataset_betclic.csv'
df_bc = pd.read_csv(DATASET_PATH)
In [215]:
# Convert the date columns to datetime in place.
# BirthDate and FirstDepositDate share one timestamp layout (the trailing 'Z'
# is matched as a literal character by this format); BetDate uses a compact
# YYYYMMDD layout.
ISO_TIMESTAMP_FORMAT = '%Y-%m-%dT%H:%M:%S.%fZ'
for iso_column in ('BirthDate', 'FirstDepositDate'):
    df_bc[iso_column] = pd.to_datetime(df_bc[iso_column], format=ISO_TIMESTAMP_FORMAT)
df_bc['BetDate'] = pd.to_datetime(df_bc['BetDate'], format='%Y%m%d')
Un outil parfait pour les EDA mais très verbeux :
pandas_profiling.ProfileReport(df_bc)
Je préfère ne pas le lancer ici et explorer les données à la main.
In [7]:
# Horizontal bar chart of the number of rows (non-null UserId values) per
# country, sorted in ascending order.
rows_per_country = df_bc.groupby('Country')['UserId'].count()
rows_per_country.sort_values(ascending=True).plot.barh(figsize=(20, 5))
Out[7]:
In [41]:
# Pie chart of the number of rows (non-null UserId values) per gender,
# largest slice first.
rows_per_gender = df_bc.groupby('Gender')['UserId'].count()
rows_per_gender.sort_values(ascending=False).plot.pie()
Out[41]:
In [67]:
# Distribution of birth dates over all rows, in 100 bins.
birth_dates = df_bc['BirthDate']
birth_dates.hist(figsize=(20, 5), bins=100)
Out[67]:
In [68]:
# Horizontal bar chart of the number of rows (non-null UserId values) per
# partner type, sorted in ascending order.
rows_per_partner_type = df_bc.groupby('PartnerType')['UserId'].count()
rows_per_partner_type.sort_values(ascending=True).plot.barh(figsize=(20, 5))
Out[68]:
In [75]:
# Distribution of first-deposit dates over all rows, in 100 bins.
first_deposit_dates = df_bc['FirstDepositDate']
first_deposit_dates.hist(figsize=(20, 5), bins=100)
Out[75]:
In [70]:
# Distribution of bet dates over all rows, in 100 bins.
bet_dates = df_bc['BetDate']
bet_dates.hist(figsize=(20, 5), bins=100)
Out[70]:
In [80]:
# Histogram of the per-day row counts (non-null BetId values per BetDate),
# i.e. how often each level of daily activity occurs.
rows_per_day = df_bc.groupby('BetDate')['BetId'].count()
rows_per_day.plot.hist(bins=100, figsize=(20, 5))
Out[80]:
In [81]:
# Horizontal bar chart of the number of rows (non-null UserId values) per
# application, sorted in ascending order.
rows_per_application = df_bc.groupby('Application')['UserId'].count()
rows_per_application.sort_values(ascending=True).plot.barh(figsize=(20, 5))
Out[81]:
In [82]:
# Horizontal bar chart of the number of rows (non-null UserId values) per
# product name, sorted in ascending order.
rows_per_product = df_bc.groupby('ProductName')['UserId'].count()
rows_per_product.sort_values(ascending=True).plot.barh(figsize=(20, 5))
Out[82]:
In [42]:
# Pie chart of the number of rows (non-null UserId values) per IsLive value,
# smallest slice first.
rows_per_is_live = df_bc.groupby('IsLive')['UserId'].count()
rows_per_is_live.sort_values(ascending=True).plot.pie(figsize=(20, 5))
Out[42]:
In [15]:
# Linear-scale histogram of bet amounts in 100 bins; bars start at y=0.1
# (the `bottom` argument) rather than at 0.
bet_amounts = df_bc['BetAmount']
bet_amounts.hist(figsize=(20, 5), bottom=0.1, bins=100)
Out[15]:
Il est impossible de voir quoi que ce soit ici.
Un passage à l'échelle logarithmique sur l'axe des ordonnées permet d'observer la distribution des montants des paris (BetAmount).
In [12]:
# Histogram of bet amounts with a logarithmic count axis.
# Bars start at 0.1 instead of 0 so that they have a positive base once the
# y-axis is switched to log scale.
fig, ax = plt.subplots()
bet_amounts = df_bc['BetAmount']
bet_amounts.hist(bins=100, bottom=0.1, figsize=(20, 5), ax=ax)
ax.set_yscale('log')
In [90]:
# Histogram of amounts won with a logarithmic count axis.
# Bars start at 0.1 instead of 0 so that they have a positive base once the
# y-axis is switched to log scale.
fig, ax = plt.subplots()
amounts_won = df_bc['AmountWon']
amounts_won.hist(bins=100, bottom=0.1, figsize=(20, 5), ax=ax)
ax.set_yscale('log')
In [92]:
# Histogram of odds with a logarithmic count axis.
# Bars start at 0.1 instead of 0 so that they have a positive base once the
# y-axis is switched to log scale.
fig, ax = plt.subplots()
odds = df_bc['Odds']
odds.hist(bins=100, bottom=0.1, figsize=(20, 5), ax=ax)
ax.set_yscale('log')
In [94]:
# Histogram of the Cashout column with a logarithmic count axis.
# Bars start at 0.1 instead of 0 so that they have a positive base once the
# y-axis is switched to log scale.
fig, ax = plt.subplots()
cashouts = df_bc['Cashout']
cashouts.hist(bins=100, bottom=0.1, figsize=(20, 5), ax=ax)
ax.set_yscale('log')
In [107]:
# Mean birth date over distinct users (first row kept per UserId).
# The mean is taken on the offsets from the earliest birth date and then
# shifted back, so the averaging is done on timedeltas rather than timestamps.
user_birth_dates = df_bc.drop_duplicates(subset='UserId')['BirthDate']
earliest_birth = user_birth_dates.min()
(user_birth_dates - earliest_birth).mean() + earliest_birth
Out[107]:
In [220]:
# Mean age, in completed years, over distinct users (first row kept per UserId).
#
# The age is derived from the number of elapsed days instead of casting the
# timedelta to 'timedelta64[Y]': pandas >= 2.0 only supports the s/ms/us/ns
# resolutions for timedelta data and raises on a cast to years.
# 365.2425 is the mean length of a Gregorian year in days, which is the
# year length numpy uses for its 'Y' unit.
DAYS_PER_YEAR = 365.2425
now = pd.Timestamp(datetime.now())
user_birth_dates = df_bc.drop_duplicates(subset='UserId')['BirthDate']
elapsed_days = (now - user_birth_dates).dt.days  # NaN where BirthDate is missing
np.floor(elapsed_days / DAYS_PER_YEAR).mean()
Out[220]:
In [21]:
# Mean birth date of distinct female users (first row kept per UserId).
#
# The gender mask is built on the de-duplicated frame itself. Building it on
# the full df_bc and applying it to the de-duplicated frame relies on pandas
# re-aligning the mask and emits
# "UserWarning: Boolean Series key will be reindexed to match DataFrame index".
# The de-duplication is also done once instead of three times.
users = df_bc.drop_duplicates(subset='UserId')
female_birth_dates = users.loc[users['Gender'] == 'F', 'BirthDate']
earliest_female_birth = female_birth_dates.min()
(female_birth_dates - earliest_female_birth).mean() + earliest_female_birth
Out[21]:
In [22]:
# Mean birth date of distinct male users (first row kept per UserId).
#
# The gender mask is built on the de-duplicated frame itself. Building it on
# the full df_bc and applying it to the de-duplicated frame relies on pandas
# re-aligning the mask and emits
# "UserWarning: Boolean Series key will be reindexed to match DataFrame index".
# The de-duplication is also done once instead of three times.
users = df_bc.drop_duplicates(subset='UserId')
male_birth_dates = users.loc[users['Gender'] == 'M', 'BirthDate']
earliest_male_birth = male_birth_dates.min()
(male_birth_dates - earliest_male_birth).mean() + earliest_male_birth
Out[22]:
In [18]:
# Average number of rows (non-null BetId values) per user.
rows_per_user = df_bc.groupby('UserId')['BetId'].count()
rows_per_user.mean()
Out[18]:
In [121]:
# Average staked amount per (user, bet) pair: BetAmount is first summed over
# all rows sharing the same UserId and BetId, then averaged across pairs.
# NOTE(review): other cells keep only one row per BetId before summing
# BetAmount; confirm which treatment of repeated BetId rows is intended.
amount_per_bet = df_bc.groupby(['UserId', 'BetId'])['BetAmount'].sum()
amount_per_bet.mean()
Out[121]:
In [122]:
# Average amount won per (user, bet) pair: AmountWon is first summed over
# all rows sharing the same UserId and BetId, then averaged across pairs.
won_per_bet = df_bc.groupby(['UserId', 'BetId'])['AmountWon'].sum()
won_per_bet.mean()
Out[122]:
In [126]:
# Total staked amount, counting each BetId once (first row kept per BetId).
unique_bets = df_bc.drop_duplicates(subset='BetId')
unique_bets['BetAmount'].sum()
Out[126]:
In [127]:
# Total amount won, counting each BetId once (first row kept per BetId).
unique_bets = df_bc.drop_duplicates(subset='BetId')
unique_bets['AmountWon'].sum()
Out[127]:
In [125]:
# Difference between the total staked and the total won, counting each BetId
# once (first row kept per BetId).
unique_bets = df_bc.drop_duplicates(subset='BetId')
total_staked = unique_bets['BetAmount'].sum()
total_won = unique_bets['AmountWon'].sum()
total_staked - total_won
Out[125]:
In [129]:
# Average number of rows (non-null BetId values) per bet date.
rows_per_day = df_bc.groupby(['BetDate'])['BetId'].count()
rows_per_day.mean()
Out[129]:
In [143]:
# Average amount won per bet, split by gender: AmountWon is summed per
# (Gender, BetId) pair, then those per-bet totals are averaged within each gender.
won_per_bet = df_bc.groupby(['Gender', 'BetId'])['AmountWon'].sum().reset_index()
won_per_bet.groupby('Gender')['AmountWon'].mean()
Out[143]:
In [152]:
# Horizontal bar chart of the average amount won per bet, by country:
# AmountWon is summed per (Country, BetId) pair, the per-bet totals are
# averaged within each country, and the result is sorted in ascending order.
won_per_bet = df_bc.groupby(['Country', 'BetId'])['AmountWon'].sum().reset_index()
mean_won_by_country = won_per_bet.groupby('Country')['AmountWon'].mean()
mean_won_by_country.sort_values(ascending=True).plot.barh(figsize=(20, 5))
Out[152]:
In [153]:
# Horizontal bar chart of the number of distinct bets per country, sorted in
# ascending order.
#
# BetId is an identifier, so adding its values together (as `.sum()` did)
# yields a number with no meaning. Distinct BetId values are counted instead.
# NOTE(review): `nunique` is used rather than `count` because other cells
# de-duplicate on BetId, which suggests a bet can span several rows; confirm.
bets_per_country = df_bc.groupby(['Country'])['BetId'].nunique()
bets_per_country.sort_values(ascending=True).plot.barh(figsize=(20, 5))
Out[153]: