In [127]:
import pandas as pd

In [45]:
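# caret ('^') delimited export; judging by the row counts below, this is the smaller (~20%) split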
data = pd.read_csv('./ltv20160906_test.csv', delimiter='^')

In [46]:
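# the larger (~80%) split, same format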
data_80 = pd.read_csv('./ltv20160906_train.csv', delimiter='^')

In [47]:
data.columns


Out[47]:
Index([u'advertiserId', u'isDradisEmployer', u'salesTouched', u'week',
       u'costmillicent', u'jobCountry', u'company', u'employeeCount',
       u'dateCreated', u'firstRevenueDate', u'lastRevenueDate', u'isAdAgency',
       u'industry', u'type', u'state', u'city', u'conversionTime',
       u'hasDradisJob', u'sid', u'kw', u'outgoingEmail', u'outgoingCall',
       u'advertiserAgeWeek', u'spendAgeWeek', u'spendAgeWeek_c',
       u'advertiserAgeWeek_c', u'conversionTime_c', u'sales_touch', u'source'],
      dtype='object')

In [50]:
df = pd.concat([data, data_80])

In [55]:
print data.shape, data_80.shape, df.shape


(619500, 29) (2450549, 29) (3070049, 29)

In [56]:
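# split the 'YYYY-WW' week label into a numeric year and week number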
df['year'] = df['week'].apply(lambda y: int(y[0:4]))
df['week_n'] = df['week'].apply(lambda y: int(y[-2:]))

In [59]:
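# keep US jobs only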
us = df[df.jobCountry=='US']

In [60]:
#x = data.groupby(['advertiserId']).sum()

In [64]:
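# drop rows carrying the stray '2016-53' week label (2016 has no ISO week 53)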
us_clean = us[us.week!='2016-53']

In [75]:
us_clean = us_clean.reset_index(drop=True)

In [76]:
print us.shape, us_clean.shape


(2345667, 31) (2320148, 31)

In [68]:
w = sorted(us_clean.week.unique())
print w, len(w)


['2015-05', '2015-06', '2015-07', '2015-08', '2015-09', '2015-10', '2015-11', '2015-12', '2015-13', '2015-14', '2015-15', '2015-16', '2015-17', '2015-18', '2015-19', '2015-20', '2015-21', '2015-22', '2015-23', '2015-24', '2015-25', '2015-26', '2015-27', '2015-28', '2015-29', '2015-30', '2015-31', '2015-32', '2015-33', '2015-34', '2015-35', '2015-36', '2015-37', '2015-38', '2015-39', '2015-40', '2015-41', '2015-42', '2015-43', '2015-44', '2015-45', '2015-46', '2015-47', '2015-48', '2015-49', '2015-50', '2015-51', '2015-52', '2015-53', '2016-01', '2016-02', '2016-03', '2016-04', '2016-05', '2016-06', '2016-07', '2016-08', '2016-09', '2016-10', '2016-11', '2016-12', '2016-13', '2016-14', '2016-15', '2016-16', '2016-17', '2016-18', '2016-19', '2016-20', '2016-21', '2016-22', '2016-23', '2016-24', '2016-25', '2016-26', '2016-27', '2016-28', '2016-29', '2016-30', '2016-31'] 80

In [77]:
type(us_clean['week'][0])


Out[77]:
str

In [118]:
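# keep the first six observed weeks (2015-05 through 2015-10),
# convert 'costmillicent' to dollars (1 dollar = 100,000 millicents),
# and restrict to Dradis employers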
complete_data = us_clean[us_clean['week'].isin(['2015-05', '2015-06', '2015-07', '2015-08', '2015-09', '2015-10'])].copy()
complete_data['cost'] = complete_data['costmillicent'] / 100000.
complete_data = complete_data[complete_data['isDradisEmployer'] == 1]


In [119]:
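# total spend (dollars) per advertiser over the six-week window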
z = complete_data.groupby(['advertiserId'])['cost'].sum()

In [120]:
import seaborn as sns

In [121]:
%matplotlib inline

In [122]:
#sns.distplot(z.tolist())

In [123]:
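# sorted per-advertiser spend totals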
final = sorted(z.tolist())

In [128]:
import numpy as np
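# box plot on a log10 scale; spend per advertiser has a long right tail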
sns.boxplot(np.log10(final))


Out[128]:
<matplotlib.axes._subplots.AxesSubplot at 0x14bb70dd0>

In [129]:
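# median vs. mean spend per advertiser; the mean is roughly double the median, confirming the right skew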
np.median(final),np.mean(final)


Out[129]:
(86.219999999999999, 166.32351682243859)

In [130]:
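# distribution of log10 spend across the 10,785 advertisers in this cohort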
sns.distplot(np.log10(final))
len(final)


Out[130]:
10785
