In [2]:
# Import libraries necessary for this project
import numpy as np
import pandas as pd
import seaborn as sns
sns.set(font_scale=1.5)
sns.set_style("white")
# Pretty display for notebooks
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib.gridspec as grd
import matplotlib.ticker as tkr
import matplotlib.font_manager as font_manager
from matplotlib.ticker import AutoMinorLocator
from matplotlib.ticker import FuncFormatter
from matplotlib import rcParams
#define plotter
minorLocatorx = AutoMinorLocator(10)
minorLocatory = AutoMinorLocator(4)
matplotlib.rc('xtick', labelsize=16)
matplotlib.rc('ytick', labelsize=16)
matplotlib.rcParams['axes.linewidth'] = 2.
plt.rcParams['axes.linewidth'] = 4
plt.rc('font', family='serif');
plt.rc('font', serif='Times New Roman');
fig_size = plt.rcParams["figure.figsize"]
fig_size[0] = 15
fig_size[1] = 9
plt.rcParams["figure.figsize"] = fig_size
In [3]:
# Read data
data = pd.read_csv('./ltv20160906_test.csv', delimiter='^')
data_80 = pd.read_csv('./ltv20160906_train.csv', delimiter='^')
data_all = pd.concat([data, data_80])
In [4]:
# split week (formatted 'YYYY-WW') into year and week_number columns
data_all['year'] = data_all['week'].apply(lambda y: int(y[0:4]))
data_all['week_number'] = data_all['week'].apply(lambda y: int(y[-2:]))
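In [ ]:
# Hedged sketch (illustrative value, not from the data): the slicing above assumes
# week strings formatted like 'YYYY-WW', e.g. '2015-07' -> year 2015, week_number 7.
print int('2015-07'[0:4]), int('2015-07'[-2:])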
In [12]:
# consider only US, Dradis-employer data
# rescale costmillicent (millicents) to USD and store it as 'cost'
dataUS = data_all[data_all.jobCountry == 'US']
dataUS = dataUS[dataUS['isDradisEmployer']==1]
dataUS = dataUS.reset_index(drop=True)
dataUS['cost'] = dataUS.costmillicent/1e5
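In [ ]:
# Hedged sanity sketch: assumes 1 millicent = 1/1000 of a cent, i.e. 1e-5 USD,
# which is what the /1e5 rescaling above implies; eyeball a few rows to confirm the scale.
print dataUS[['costmillicent', 'cost']].head()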
In [13]:
#compute week rank
weekRank = dataUS.groupby('advertiserId')['week'].rank(ascending=True)
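In [ ]:
# Hedged sketch on a toy frame (not the real data): rank(ascending=True) numbers each
# advertiser's weeks chronologically, 1 = earliest active week.
_toy = pd.DataFrame({'advertiserId': [1, 1, 2, 2, 2],
                     'week': ['2015-01', '2015-02', '2015-01', '2015-03', '2015-02']})
print _toy.groupby('advertiserId')['week'].rank(ascending=True).tolist()  # expected: [1.0, 2.0, 1.0, 3.0, 2.0]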
In [14]:
#sanity check
print dataUS.shape, weekRank.shape
In [15]:
#add weekRank
dataUS['week_rank'] = weekRank
In [16]:
# clean employeeCount: map range strings such as '11-50' or '10000+' (and NaNs) to the lower bound of the range
dataUS['employeeCount'] = dataUS['employeeCount'].apply(lambda y: '0-' if type(y)!=str else y.replace('+','-'))
print sorted(dataUS.employeeCount.unique())
dataUS['employeeCount'] = dataUS['employeeCount'].apply(lambda y: int(y[0:y.find('-')]))
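In [ ]:
# Hedged sketch (sample strings, not from the data): the two apply() calls above turn
# range strings into the integer lower bound of the range, with NaNs mapped to 0.
_ex = pd.Series(['11-50', '10000+', np.nan])
_ex = _ex.apply(lambda y: '0-' if type(y) != str else y.replace('+', '-'))
print _ex.apply(lambda y: int(y[0:y.find('-')])).tolist()  # expected: [11, 10000, 0]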
In [17]:
# clean industry: fill missing values with 'Blank'
dataUS['industry'] = dataUS['industry'].apply(lambda y: 'Blank' if type(y)!=str else y)
In [22]:
# check for duplicate (advertiserId, week) entries
double = dataUS.groupby(['advertiserId','week']).count()
print double.employeeCount.unique() # should be 1
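In [ ]:
# Hedged alternative check using pandas' duplicated(); should print 0 if
# (advertiserId, week) really is a unique key.
print dataUS.duplicated(subset=['advertiserId', 'week']).sum()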
In [23]:
# check how many weeks advertisers spend (distinct week counts per advertiser)
np.unique(dataUS.groupby('advertiserId')['week'].count().values)
Out[23]:
In [27]:
# remove data points that do not make sense (the spurious week '2016-53')
print dataUS.shape
dataUS = dataUS[dataUS.week!='2016-53']
print dataUS.shape
In [29]:
# Sanity check
w = sorted(dataUS.spendAgeWeek_c.unique())
print 'spendAgeWeek_c values'
print w, len(w)
print "Possible values for 'how many weeks an advertiser spend'"
print np.array(set(dataUS.groupby('advertiserId')['week'].count().values))
In [30]:
# keep only rows with 0 <= spendAgeWeek_c < 74
dataUS = dataUS[(dataUS['spendAgeWeek_c']<74)&(dataUS['spendAgeWeek_c']>=0)]
In [31]:
# Sanity check
w = sorted(dataUS.spendAgeWeek_c.unique())
print 'spendAgeWeek_c values'
print w, len(w)
print "Possible values for 'how many weeks an advertiser spend'"
print np.array(set(dataUS.groupby('advertiserId')['week'].count().values))
In [32]:
dataUS.columns
Out[32]:
In [36]:
dataUS.sort_values(['advertiserId', 'week'], ascending=[True, True], inplace=True)
In [134]:
# Sanity Check
xy = dataUS.groupby('advertiserId')['cost'].sum().values
sns.distplot(np.log10(xy))
print np.median(xy),np.mean(xy)
In [78]:
# pick only the important columns (copy so new columns can be added without SettingWithCopy warnings)
data_clean = dataUS[['advertiserId', 'week', 'week_rank', 'spendAgeWeek_c', 'cost', 'employeeCount', 'isAdAgency',
                     'industry', 'type', 'source', 'salesTouched']].copy()
In [136]:
# sanity check: both frames should have the same number of rows
print data_clean.shape, dataUS.shape
In [79]:
# Sanity check: advertisers are not labeled twice
advertisers = data_clean.groupby(['advertiserId']).count()
advertisers_with_info = data_clean.groupby(['advertiserId', 'employeeCount', 'isAdAgency', 'industry',
'type', 'source', 'salesTouched']).count()
print 'Total advertisers:', advertisers.shape
print 'Total advertisers with labels:', advertisers_with_info.shape
In [114]:
data_clean['first_four_weeks'] = data_clean['spendAgeWeek_c'].apply(lambda y: 1.0 if y<4 else 0.)
In [115]:
data_clean['cost_first_four_weeks'] = data_clean['first_four_weeks']*data_clean['cost']
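In [ ]:
# Hedged sanity sketch: the indicator should only take the values 0.0 and 1.0, and the
# second print gives the fraction of total cost that falls in the first four weeks.
print sorted(data_clean['first_four_weeks'].unique())  # expected: [0.0, 1.0]
print data_clean['cost_first_four_weeks'].sum()/data_clean['cost'].sum()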
In [137]:
# sanity check: both frames should still have the same number of rows
print data_clean.shape, dataUS.shape
In [138]:
#Aggregate data
agg_data = data_clean.groupby(['advertiserId', 'employeeCount', 'isAdAgency', 'industry',
'type', 'source', 'salesTouched']).sum()
In [140]:
agg_data.reset_index(inplace=True)
In [144]:
agg_data = agg_data[['advertiserId', 'employeeCount', 'isAdAgency', 'industry',
'type', 'source', 'salesTouched', 'cost', 'cost_first_four_weeks']]
In [149]:
agg_data.head()
Out[149]:
In [146]:
cost_data = agg_data['cost'].values
In [147]:
sns.distplot(np.log10(cost_data))
print np.median(cost_data),np.mean(cost_data)
print np.median(np.log10(cost_data)),np.mean(np.log10(cost_data))
print 10.**np.median(np.log10(cost_data)),10.**np.mean(np.log10(cost_data))
In [170]:
complete_data = dataUS[dataUS['spendAgeWeek_c']==73]
In [171]:
complete_data.shape
Out[171]:
In [172]:
data_with_full_run = dataUS[dataUS['advertiserId'].isin(sorted(complete_data.advertiserId.unique()))]
data_with_full_run_early = data_with_full_run[data_with_full_run['spendAgeWeek_c']==0]
In [173]:
print sorted(data_with_full_run_early.week.unique())
In [180]:
advertiser_complete_cycle = sorted(complete_data.advertiserId.unique())
In [184]:
advertiser_incomplete_cycle = list(set(agg_data.advertiserId.unique()) - set(advertiser_complete_cycle))
In [185]:
print len(advertiser_complete_cycle), len(advertiser_incomplete_cycle)
In [186]:
test_data = agg_data[agg_data['advertiserId'].isin(advertiser_complete_cycle)]
In [187]:
to_be_predicted_data = agg_data[agg_data['advertiserId'].isin(advertiser_incomplete_cycle)]
In [189]:
print test_data.shape, to_be_predicted_data.shape
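In [ ]:
# Hedged sketch: the complete- and incomplete-cycle advertiser lists should be disjoint
# and together cover every row of agg_data (each advertiser appears once after aggregation).
print len(set(advertiser_complete_cycle) & set(advertiser_incomplete_cycle))  # expected: 0
print test_data.shape[0] + to_be_predicted_data.shape[0] == agg_data.shape[0]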
In [110]:
## Only consider these columns.
columns = ['employeeCount', 'isDradisEmployer', 'salesTouched', 'isAdAgency',
'industry', 'type', 'conversionTime_c', 'hasDradisJob', 'advertiserAgeWeek_c',
'sales_touch', 'source']
print len(columns)
In [95]:
#Make dummies for categorical features.
def preprocess_features(X):
    ''' Preprocesses the advertiser data: converts non-numeric binary variables into
        binary (0/1) variables and converts categorical variables into dummy variables. '''
    # Initialize new output DataFrame
    output = pd.DataFrame(index = X.index)
    # Investigate each feature column of the data
    for col, col_data in X.iteritems():
        # If data type is non-numeric, replace all yes/no values with 1/0
        if col_data.dtype == object:
            col_data = col_data.replace(['yes', 'no'], [1, 0])
        # If data type is still categorical, convert to dummy variables
        if col_data.dtype == object:
            # Example: 'industry' => 'industry_Retail', 'industry_Tech', etc.
            col_data = pd.get_dummies(col_data, prefix = col)
        # Collect the revised columns
        output = output.join(col_data)
    return output
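In [ ]:
# Hedged sketch on a toy frame (not the real data): object columns come back as 0/1 dummy
# columns prefixed with the original name, while numeric columns pass through unchanged.
_toy = pd.DataFrame({'industry': ['Retail', 'Tech', 'Retail'], 'salesTouched': [1, 0, 1]})
print preprocess_features(_toy)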
In [96]:
def remove_nans(df):
    # Report columns containing NaNs, then fill those NaNs with 0.
    nan_df = df[pd.isnull(df).any(axis=1)]
    for col in nan_df.columns:
        tmp = nan_df[col][pd.isnull(nan_df[col])]
        if len(tmp) >= 1:
            # These features have NULLs to signify the absence of such a feature
            print col, len(tmp)
            df[col] = df[col].fillna(0.)
    print np.shape(df)
    return df
In [97]:
## Define a helper that flags repeat outliers (rows outlying in more than two features)
def remove_outliers(df):
    outliers = []
    for feature in df.columns:
        # Calculate Q1 (25th percentile of the data) for the given feature
        Q1 = np.percentile(df[feature], 25)
        # Calculate Q3 (75th percentile of the data) for the given feature
        Q3 = np.percentile(df[feature], 75)
        # Use the interquartile range to calculate an outlier step (1.5 times the interquartile range)
        step = 1.5*(Q3 - Q1)
        outliers = outliers + df[~((df[feature] >= Q1 - step) &
                                   (df[feature] <= Q3 + step))].index.tolist()
    # Keep the indices of data points flagged as outliers in more than two features
    from collections import Counter
    counts = Counter(outliers)
    outliers = sorted([value for value, count in counts.items() if count > 2])
    return outliers
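In [ ]:
# Hedged sketch on a toy numeric frame (not the real data): remove_outliers returns the
# index labels of rows falling outside the 1.5*IQR fence in more than two features
# (here, row 4 is extreme in all three columns).
_toy = pd.DataFrame({'a': [1, 2, 3, 4, 100],
                     'b': [10, 11, 12, 13, 500],
                     'c': [0.1, 0.2, 0.3, 0.2, 9.9]})
print remove_outliers(_toy)  # expected: [4]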
In [98]:
## Remove NaNs
#data_clean = remove_nans(dataUS)
#data_clean = preprocess_features(dataUS)
## Remove outliers
#outliers = remove_outliers(data)
#good_data = data_clean.drop(data_clean.index[outliers]).reset_index(drop = True)
#good_data_X = good_data.drop(['costmillicent'], axis =1)
#good_data_y = good_data['costmillicent']
#print "The dimensions of the data without outliers now is {}".format(np.shape(good_data_X))
In [99]:
#Q1 = np.percentile(dataUS['costmillicent'], 25)
#Q3 = np.percentile(dataUS['costmillicent'], 75)
#step = 1.5*(Q3-Q1)
#outliers = list(dataUS[~((dataUS['costmillicent'] >= Q1 - step) & \
# (dataUS['costmillicent'] <= Q3 + step))].index.tolist())
#good_data = dataUS.drop(dataUS.index[outliers]).reset_index(drop = True)
#g = sns.FacetGrid(good_data, col="source")
#g.map(sns.distplot, "costmillicent")
#plt.xlim(0,)
In [84]:
us_clean = dataUS[dataUS.week != '2016-53']
complete_data = us_clean[us_clean['week'].isin(['2015-05', '2015-06', '2015-07', '2015-08', '2015-09', '2015-10'])].copy()
complete_data['cost'] = complete_data['costmillicent']/1e5  # keep the same millicent -> USD scaling as above
complete_data = complete_data[complete_data['isDradisEmployer']==1]
In [85]:
tmp = complete_data.groupby(['advertiserId'])
new_complete_data = pd.DataFrame(complete_data.groupby(['advertiserId', 'source', 'type', 'conversionTime_c',
'hasDradisJob', 'advertiserAgeWeek_c',
'isAdAgency']).agg({'cost': np.sum,
'sales_touch': np.mean,
'advertiserId': np.count_nonzero}))
z = complete_data.groupby(['advertiserId'])['cost'].sum()
final = sorted(z.tolist())  # total cost per advertiser; used in the distribution plot below
#np.shape(final)
new_complete_data['CountAdvertiserID'] = new_complete_data.advertiserId
new_complete_data = new_complete_data.drop(['advertiserId'], axis=1)
new_complete_data['mean_sales_touch'] = new_complete_data.sales_touch
new_complete_data = new_complete_data.drop(['sales_touch'], axis=1)
grouped = new_complete_data.reset_index()
print grouped.shape
grouped.head()
Out[85]:
In [106]:
columns = ['employeeCount', 'isDradisEmployer', 'salesTouched', 'isAdAgency',
           'industry', 'type', 'conversionTime_c', 'hasDradisJob', 'advertiserAgeWeek_c',
           'sales_touch', 'source']
tmp = []
for adID in grouped.advertiserId:
    tmp.append(list(set(complete_data[complete_data.advertiserId == adID]['industry']))[0])
In [109]:
grouped['industry'] = tmp
print np.shape(grouped)
grouped.head()
Out[109]:
In [121]:
## Preprocess features with NaNs and categoricals
data_clean = remove_nans(grouped)
data_clean = preprocess_features(data_clean)
In [125]:
data_clean_train = data_clean.drop(['advertiserId'], axis=1)
data_clean_train_y = data_clean_train['cost']
data_clean_train_X = data_clean_train.drop(['cost'], axis=1)
In [179]:
from sklearn.metrics import r2_score, mean_squared_error, make_scorer
def performance_metric(y_true, y_predict):
    """ Calculates and returns the performance score between
        true and predicted values based on the metric chosen. """
    # Calculate the performance score between 'y_true' and 'y_predict'
    score = r2_score(y_true, y_predict)
    # Return the score
    return score

def create_sets(X, y):
    # Set the number of training points (80% of the data)
    num_train = int(round(0.80*len(X)))
    # Set the number of testing points
    num_test = X.shape[0] - num_train
    # Shuffle and split the dataset into the number of training and testing points above
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = num_train, random_state=123458)
    # Show the results of the split
    print "Training set has {} samples.".format(X_train.shape[0])
    print "Testing set has {} samples.".format(X_test.shape[0])
    return X_train, X_test, y_train, y_test
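In [ ]:
# Hedged sketch (toy numbers, not from the data): performance_metric is just sklearn's R^2,
# so a perfect prediction scores 1.0 and worse predictions score lower.
print performance_metric([3.0, 2.5, 4.0], [3.0, 2.5, 4.0])
print performance_metric([3.0, 2.5, 4.0], [2.5, 3.0, 3.5])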
In [180]:
def fit_model(X, y, n_estimators = 1000, min_samples_leaf = 1, min_samples_split = 2):
    cv_sets = ShuffleSplit(X.shape[0], n_iter = 10, test_size = 0.20)
    regressor = GradientBoostingRegressor(loss = 'lad', n_estimators = n_estimators,
                                          min_samples_leaf = min_samples_leaf,
                                          min_samples_split = min_samples_split,
                                          random_state = 102345)
    # Grid of hyperparameters to search over (the grid's min_samples_leaf overrides the default above)
    params = {'learning_rate': [0.01, 0.02, 0.05, 0.1],  # 'n_estimators': [100, 500, 1000],
              'max_depth': [2, 3, 4, 5, 6], 'min_samples_leaf': [2, 3, 5, 7]}
    # Transform 'performance_metric' into a scoring function using 'make_scorer'
    scoring_fnc = make_scorer(performance_metric)
    # Create the grid search object
    grid = GridSearchCV(regressor, params, cv=cv_sets, scoring = scoring_fnc)  # , verbose=10)
    # Fit the grid search object to the data to compute the optimal model
    grid = grid.fit(X, y)
    # Print the best cross-validation score and the corresponding model
    print 'The best cross-validation score = {:.2f}'.format(grid.best_score_)
    print grid.best_estimator_
    return grid
In [182]:
## Train-test split
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from sklearn.cross_validation import train_test_split, ShuffleSplit
from sklearn.grid_search import GridSearchCV
X_train, X_test, y_train, y_test = create_sets(data_clean_train_X, np.log10(data_clean_train_y))
reg0 = fit_model(X_train, y_train)
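In [ ]:
# Hedged sketch: score the tuned model on the held-out split with the same R^2 metric.
# reg0 is the fitted GridSearchCV object returned by fit_model above.
y_pred = reg0.best_estimator_.predict(X_test)
print 'Held-out R^2 = {:.2f}'.format(performance_metric(y_test, y_pred))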
In [17]:
#define plotter
minorLocatorx = AutoMinorLocator(10)
minorLocatory = AutoMinorLocator(4)
matplotlib.rc('xtick', labelsize=16)
matplotlib.rc('ytick', labelsize=16)
matplotlib.rcParams['axes.linewidth'] = 2.
plt.rcParams['axes.linewidth'] = 4
plt.rc('font', family='serif')
plt.rc('font', serif='Times New Roman')
fig_size = plt.rcParams["figure.figsize"]
fig_size[0] = 12
fig_size[1] = 12
plt.rcParams["figure.figsize"] = fig_size
f, (ax_box, ax_hist) = plt.subplots(2, sharex=True,gridspec_kw={"height_ratios": (.15, .85)})
sns.boxplot(np.log10(final), ax=ax_box, color='Pink');
sns.distplot(np.log10(final), ax=ax_hist, color='grey', norm_hist=False, bins=25, hist_kws={'alpha': 0.8, 'color': 'pink'});
ax_box.set(yticks=[], xlabel='log$_{10}$ [Cost [USD]]')
sns.despine(ax=ax_hist);
sns.despine(ax=ax_box, left=True)
plt.xlabel('log$_{10}$ [Cost [USD]]', fontsize=35)
plt.ylabel('Counts', fontsize=35);
#plt.xlim(1e4, 1e6);
plt.yticks(fontsize=25);
plt.xticks(fontsize=25);#, format)
#plt.semilogx();
#plt.savefig('Cost.pdf', dpi=400)
In [ ]:
part_comp_data = complete_data[list(set(columns))]
In [ ]:
part_comp_data.head()
In [ ]:
data_clean = remove_nans(part_comp_data)
data_clean = preprocess_features(data_clean)
## Remove outliers
#outliers = remove_outliers(data)
#good_data = data_clean.drop(data_clean.index[outliers]).reset_index(drop = True)
#good_data_X = good_data.drop(['costmillicent'], axis =1)
#good_data_y = good_data['costmillicent']
print "The dimensions of the data without outliers now is {}".format(np.shape(data_clean))
data_clean.columns
In [ ]:
len(complete_data)
In [ ]:
complete_data.columns
In [ ]:
set(complete_data.sales_touch)