In [127]:
import pandas as pd
In [45]:
# Load the '^'-separated file into `data` (file name suggests this is the test split).
data = pd.read_csv('./ltv20160906_test.csv', sep='^')
In [46]:
# Load the '^'-separated file into `data_80` (file name suggests this is the train split).
data_80 = pd.read_csv('./ltv20160906_train.csv', sep='^')
In [47]:
# Display the column labels of `data`.
data.columns
Out[47]:
In [50]:
# Stack the two frames row-wise; each frame keeps its own index labels,
# so labels may repeat in `df`.
df = pd.concat(objs=[data, data_80], axis=0)
In [55]:
# Shapes of the two input frames and of their concatenation.
# Printing one pre-formatted string gives the same text whether `print` is
# the Python 2 statement or the Python 3 function.
print("%s %s %s" % (data.shape, data_80.shape, df.shape))
In [56]:
# Split the `week` string into numeric parts with vectorized string slicing
# instead of a per-row Python lambda.
# NOTE(review): assumes `week` holds strings shaped like 'YYYY-WW' — confirm.
df['year'] = df['week'].str[:4].astype('int64')      # first four characters
df['week_n'] = df['week'].str[-2:].astype('int64')   # last two characters
In [59]:
# Keep only the rows whose jobCountry is 'US'.
us = df.loc[df['jobCountry'] == 'US']
In [60]:
# Disabled: per-advertiserId sums over `data`.
#x = data.groupby(['advertiserId']).sum()
In [64]:
# Drop every row labelled with week '2016-53'.
us_clean = us[us['week'].ne('2016-53')]
In [75]:
# Renumber the rows 0..n-1; drop=True discards the old index instead of
# keeping it as a column.
us_clean = us_clean.reset_index(drop=True)
In [76]:
# Shapes before and after removing the '2016-53' rows.
# Printing one pre-formatted string gives the same text whether `print` is
# the Python 2 statement or the Python 3 function.
print("%s %s" % (us.shape, us_clean.shape))
In [68]:
# Distinct week labels in ascending order, followed by how many there are.
w = sorted(us_clean.week.unique())
# Printing one pre-formatted string gives the same text whether `print` is
# the Python 2 statement or the Python 3 function.
print("%s %s" % (w, len(w)))
In [77]:
# Python type of the `week` value in the row labelled 0.
type(us_clean.loc[0, 'week'])
Out[77]:
In [118]:
# Rows from the six listed 2015 weeks whose isDradisEmployer flag equals 1,
# plus a derived `cost` column.
# Both filters go into a single mask and the column is added with .assign(),
# so nothing is written into a slice of `us_clean` (the original item
# assignment on a filtered frame raises SettingWithCopyWarning).
# NOTE(review): dividing costmillicent by 100000 presumably converts
# milli-cents to whole currency units — confirm the unit.
complete_data = (
    us_clean[
        us_clean['week'].isin(['2015-05', '2015-06', '2015-07', '2015-08', '2015-09', '2015-10'])
        & (us_clean['isDradisEmployer'] == 1)
    ]
    .assign(cost=lambda frame: frame['costmillicent'] / 100000.)
)
In [119]:
# Total `cost` per advertiserId, as a Series indexed by advertiserId.
z = complete_data['cost'].groupby(complete_data['advertiserId']).sum()
In [120]:
import seaborn as sns
In [121]:
%matplotlib inline
In [122]:
# Disabled: distribution plot of the values in `z`.
#sns.distplot(z.tolist())
In [123]:
# Values of `z` as a plain Python list, sorted ascending.
final = z.tolist()
final.sort()
In [128]:
import numpy as np
# Box plot of the values in `final` on a log10 scale, passed explicitly as
# the x variable.
# NOTE(review): np.log10 maps 0 to -inf — confirm `final` contains no zeros.
sns.boxplot(x=np.log10(final))
Out[128]:
In [129]:
# Median and mean of the values in `final`, displayed as a (median, mean) tuple.
np.median(final),np.mean(final)
Out[129]:
In [130]:
# Distribution plot of the log10-transformed values in `final`.
sns.distplot(np.log10(final))
# Last expression of the cell, so this count is the value the cell displays.
len(final)
Out[130]:
In [ ]: