In [1]:
%matplotlib inline
In [2]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf
import statsmodels.api as sm
The default working directory is the code subdirectory. The cell below changes to the main repo directory one level up.
In [3]:
os.chdir("..")  # move from ./code up to the repo root
In [4]:
def pd_tab(df, col, sort_by='count', asc=False):
    """Tabulate counts and percentages for a column, including NaN values."""
    tab = df[col].value_counts(dropna=False).reset_index(name='count')
    tab.columns = [col, 'count']
    tab['percent'] = tab['count'] / tab['count'].sum()
    tab.sort_values(by=sort_by, inplace=True, ascending=asc)
    return tab
In [5]:
raw_data=pd.read_csv("./raw_data/data",header=None)
raw_data.head()
Out[5]:
In [6]:
col_names = pd.read_csv("./raw_data/column.names.txt", header=None,
                        sep=":")
col_names.head()
Out[6]:
In [7]:
col_names.columns=['variable','type']
In [8]:
col_names = pd.concat((col_names,
                       pd.DataFrame({'variable': ['image_type'],
                                     'type': ['0,1.']})), axis=0)
col_names = col_names[['variable', 'type']]
In [9]:
col_names.head()
Out[9]:
In [10]:
raw_data.columns=list(col_names.variable)
raw_data.head()
Out[10]:
In [11]:
data_types=raw_data.dtypes.reset_index()
data_types.columns=['variable','d_type']
In [12]:
pd_tab(data_types,'d_type')
Out[12]:
All of the features will be converted to numeric.
Non-numeric features
In [13]:
data_types[data_types.d_type=='object']
Out[13]:
In [14]:
temp = raw_data.height[pd.to_numeric(raw_data.height, errors='coerce').isna()]
np.unique(temp)
Out[14]:
In [15]:
temp = raw_data.width[pd.to_numeric(raw_data.width, errors='coerce').isna()]
np.unique(temp)
Out[15]:
In [16]:
temp = raw_data.aratio[pd.to_numeric(raw_data.aratio, errors='coerce').isna()]
np.unique(temp)
Out[16]:
In [17]:
temp = raw_data.local[pd.to_numeric(raw_data.local, errors='coerce').isna()]
np.unique(temp)
Out[17]:
The non-numeric values are recorded as "?".
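A quick way to confirm and count those placeholders (a sketch; the whitespace-stripping is an assumption about how the raw strings may be padded):
In [ ]:
# Count the '?' placeholders in each of the four object-typed columns.
raw_data[['height', 'width', 'aratio', 'local']].apply(
    lambda s: s.astype(str).str.strip().eq('?').sum())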
Converting the image type variable into numeric
In [18]:
pd_tab(raw_data,'image_type')
Out[18]:
In [19]:
clean_data=raw_data.copy()
In [20]:
clean_data.replace({'image_type': {'nonad.':0,'ad.':1}},inplace=True)
clean_data.head()
Out[20]:
Converting all other variables into numeric
In [21]:
clean_data = clean_data.apply(pd.to_numeric, errors='coerce')  # applied column-wise; '?' becomes NaN
clean_data.head()
Out[21]:
Inspecting the 'Height' feature
In [22]:
clean_data.height.dropna().describe()
Out[22]:
In [23]:
g=sns.distplot(clean_data.height.dropna())
g.axes.set_ylim(0,)
g.axes.set_xlim(0,)
g.axes.set_title('Image Heights\n',fontsize=20)
g.set_xlabel('Height',fontsize=15)
Out[23]:
In [24]:
g=sns.distplot(np.log(clean_data.height.dropna()))
g.axes.set_ylim(0,)
g.axes.set_title('Logged Image Heights\n',fontsize=20)
g.set_xlabel('Height',fontsize=15)
Out[24]:
Taking the log of the continuous variables can be an optional pipeline step during the model training stage. In theory, many parametric models, such as the logistic classifier, benefit from (standardized) approximately symmetric feature distributions.
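As a sketch of how that optional step could look in a scikit-learn pipeline (the step names, np.log1p, and the final classifier are illustrative assumptions, not something established in this notebook):
In [ ]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer, StandardScaler
from sklearn.linear_model import LogisticRegression

# Log-transform, standardize, then classify; np.log1p tolerates zeros.
log_pipeline = Pipeline([
    ('log', FunctionTransformer(np.log1p, validate=False)),
    ('scale', StandardScaler()),
    ('clf', LogisticRegression()),
])
# Would be fit on a NaN-free numeric feature matrix, e.g. log_pipeline.fit(X, y)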
Inspecting the 'Width' feature
In [25]:
clean_data.width.dropna().describe()
Out[25]:
In [26]:
g=sns.distplot(clean_data.width.dropna())
g.axes.set_ylim(0,)
g.axes.set_xlim(0,)
g.axes.set_title('Image Widths\n',fontsize=20)
g.set_xlabel('Width',fontsize=15)
Out[26]:
In [27]:
g=sns.distplot(np.log(clean_data.width.dropna()))
g.axes.set_ylim(0,)
g.axes.set_title('Logged Image Widths\n',fontsize=20)
g.set_xlabel('Width',fontsize=15)
Out[27]:
Widths are bimodal, and there is no simple transformation to address that. I may experiment with representing width as a categorical variable.
Inspecting the aspect ratio feature
In [28]:
clean_data.aratio.dropna().describe()
Out[28]:
In [29]:
g=sns.distplot(clean_data.aratio.dropna())
g.axes.set_ylim(0,)
g.axes.set_xlim(0,)
g.axes.set_title('Image Aspect Ratio\n',fontsize=20)
g.set_xlabel('Ratio',fontsize=15)
Out[29]:
In [30]:
100*len(clean_data[clean_data.aratio>10])/len(clean_data)  # ~1.25% of instances have aratio > 10
Out[30]:
In [31]:
g=sns.distplot(np.log(clean_data.aratio.dropna()))
g.axes.set_ylim(0,)
g.axes.set_title('Logged Image Aspect Ratio\n',fontsize=20)
g.set_xlabel('Ratio',fontsize=15)
Out[31]:
Taking the log of the aspect ratio improves the symmetry of the distribution, but the result is still not approximately normal. On the non-logged scale, aspect ratios greater than 10 definitely look like outliers; however, they are still plausible for images. I created an image with an aspect ratio of 60 to confirm. Dropping outliers may result in an inability to make predictions for certain instances, which isn't always practical. Therefore, I will initially include algorithms that are robust to outliers before experimenting with removing them.
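If I do experiment with taming rather than dropping the outliers, winsorizing is one option. A minimal sketch (the 99th-percentile cap is an illustrative choice, not tuned):
In [ ]:
# Cap aspect ratios at the 99th percentile instead of dropping rows,
# so every instance can still be scored at prediction time.
cap = clean_data.aratio.quantile(0.99)
clean_data.aratio.clip(upper=cap).describe()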
In [32]:
temp=clean_data.mean().reset_index(name='Percent')
temp.columns=['Variable','Percent']
temp=temp[3:]  # drop height, width, and aratio (the continuous features)
temp['Percent']=np.round(temp['Percent']*100,2)
temp.sort_values(by='Percent',inplace=True,ascending=False)
In [33]:
temp.head()
Out[33]:
In [34]:
temp.tail()
Out[34]:
In [35]:
g=sns.distplot(temp.Percent)
g.axes.set_ylim(0,)
g.axes.set_xlim(0,100)
g.axes.set_title('Distribution of Sparsity of Binary Variables\n',fontsize=20)
g.set_xlabel('Percent of Affirmative/True Instances',fontsize=15)
Out[35]:
In [36]:
len(temp[temp.Percent<10])/len(temp)  # fraction of binary features affirmative less than 10% of the time
Out[36]:
In [37]:
len(temp[temp.Percent<1])/len(temp)  # fraction of binary features affirmative less than 1% of the time
Out[37]:
In [38]:
len(temp[temp.Percent<.1])/len(temp)  # fraction of binary features affirmative less than 0.1% of the time
Out[38]:
The feature space for the binary variables is sparse: 99% of the binary variables are affirmative less than 10% of the time, and 86% are affirmative less than 1% of the time.
In addition to using algorithms robust to sparse features, I may experiment with feature selection, such as scikit-learn's VarianceThreshold, tuned via cross-validation.
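A minimal sketch of that idea (the column slice and the 0.01 threshold are illustrative assumptions; in practice the threshold would be tuned):
In [ ]:
from sklearn.feature_selection import VarianceThreshold

# Drop binary features with variance below 0.01, i.e. roughly those
# affirmative (or negative) less than ~1% of the time.
binary_cols = clean_data.columns[3:-1]  # assumed: everything between the continuous features and the label
vt = VarianceThreshold(threshold=0.01)
vt.fit(clean_data[binary_cols].fillna(0))
print(vt.get_support().sum(), 'of', len(binary_cols), 'binary features kept')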
Rows with all missing data
In [39]:
len(clean_data)-len(clean_data.dropna(axis=0,how='all'))
Out[39]:
Rows with any missing data
In [40]:
len(clean_data)-len(clean_data.dropna(axis=0,how='any'))
Out[40]:
Counting missing instances by variable
In [41]:
temp=clean_data.isnull().sum().reset_index()
temp.columns=['variable','missing']
temp.sort_values(by='missing',inplace=True,ascending=False)
temp['percent']=np.round(100*temp['missing']/len(clean_data),2)
In [42]:
temp[temp.missing>0]
Out[42]:
Missing values in the height, width, and aspect ratio
In [43]:
len(clean_data[clean_data.height.notna() &
               clean_data.width.notna() &
               clean_data.aratio.isna()])
Out[43]:
In [44]:
len(clean_data[clean_data.height.notna() &
               clean_data.width.isna() &
               clean_data.aratio.notna()])
Out[44]:
In [45]:
len(clean_data[clean_data.height.isna() &
               clean_data.width.notna() &
               clean_data.aratio.notna()])
Out[45]:
In this data, a missing value in any one of the three continuous variables never occurs alongside complete data in the other two, so imputing one from the others is not possible.
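For context, the aspect ratio is presumably width divided by height, which is why such imputation would have been attractive. A quick sanity check of that assumption on the complete rows (the tolerance is illustrative):
In [ ]:
# On rows where all three are observed, the recomputed ratio should
# approximately match aratio if it was derived as width/height.
complete = clean_data.dropna(subset=['height', 'width', 'aratio'])
np.allclose(complete.width / complete.height, complete.aratio, atol=0.01)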
In [46]:
clean_data['missing_aratio']=clean_data.aratio.isnull()
clean_data['image_type'].groupby(clean_data['missing_aratio']).mean()*100
Out[46]:
In [47]:
f = 'image_type ~ missing_aratio'
results = smf.glm(formula=f, data=clean_data,
                  family=sm.families.Binomial()).fit()
print(results.summary())
16% of instances with a missing aspect ratio are ads, versus 8% of instances with an observed aspect ratio. The difference, in a univariate logistic regression, is statistically significant.
In light of the large percentage of missing values and the apparent non-randomness of the missingness with respect to the class being predicted, I will represent the aspect ratio, height, and width as categorical variables, with 'missing' as the reference category.
In [48]:
clean_data['missing_local']=clean_data.local.isnull()
clean_data['image_type'].groupby(clean_data['missing_local']).mean()*100
Out[48]:
The local variable is missing for only 15 observations. Assuming this is representative of the general rate at which it is missing, I will simply impute the missing values to 0. In practice, if 'local' turned out to be an extremely important feature, I would discuss the issue with people in the company who fully understand how the data was generated, to see whether smarter imputation approaches are available or whether the data collection process can be changed to avoid missing values in this variable.
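In pipeline form, the same zero-fill could be written with scikit-learn's SimpleImputer; a minimal sketch (requires scikit-learn >= 0.20, and is not how the notebook implements it below):
In [ ]:
from sklearn.impute import SimpleImputer

# Reusable transformer equivalent of filling missing 'local' values with 0.
imputer = SimpleImputer(strategy='constant', fill_value=0)
local_filled = imputer.fit_transform(clean_data[['local']])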
Starting from the raw data, I now apply everything learned during the exploration to prepare the data for modeling. The code below will be re-used in the standalone Python script that makes predictions on new raw data.
Main dataset
In [50]:
raw_data=pd.read_csv("./raw_data/data",header=None)
raw_data.head()
Out[50]:
Attach column names
In [51]:
col_names = pd.read_csv("./raw_data/column.names.txt", header=None,
                        sep=":")
col_names.columns = ['variable', 'type']
In [52]:
col_names = pd.concat((col_names,
                       pd.DataFrame({'variable': ['image_type'],
                                     'type': ['0,1.']})), axis=0)
col_names = col_names[['variable', 'type']]
In [53]:
raw_data.columns=list(col_names.variable)
In [54]:
raw_data.replace({'image_type': {'nonad.':0,'ad.':1}},inplace=True)
In [55]:
raw_data = raw_data.apply(pd.to_numeric, errors='coerce')  # applied column-wise; '?' becomes NaN
In [56]:
raw_data.loc[raw_data.local.isnull(), 'local'] = 0
Aspect Ratio
In [57]:
raw_data['aratio_cat'] = 'aratio_NaN'
raw_data.loc[(raw_data.aratio>=0) & (raw_data.aratio<2), 'aratio_cat'] = 'aratio_0t2'
raw_data.loc[(raw_data.aratio>=2) & (raw_data.aratio<4), 'aratio_cat'] = 'aratio_2t4'
raw_data.loc[(raw_data.aratio>=4) & (raw_data.aratio<6), 'aratio_cat'] = 'aratio_4t6'
raw_data.loc[(raw_data.aratio>=6) & (raw_data.aratio<8), 'aratio_cat'] = 'aratio_6t8'
raw_data.loc[(raw_data.aratio>=8) & (raw_data.aratio<10), 'aratio_cat'] = 'aratio_8t10'
raw_data.loc[(raw_data.aratio>=10), 'aratio_cat'] = 'aratio_10t'
In [58]:
aspect_cats=pd.get_dummies(raw_data['aratio_cat'])
del aspect_cats['aratio_NaN'] #comparison category
del raw_data['aratio_cat']
aspect_cats.head()
Out[58]:
Height
In [59]:
raw_data['height_cat'] = 'height_NaN'
raw_data.loc[(raw_data.height>=0) & (raw_data.height<50), 'height_cat'] = 'height_0t50'
raw_data.loc[(raw_data.height>=50) & (raw_data.height<100), 'height_cat'] = 'height_50t100'
raw_data.loc[(raw_data.height>=100) & (raw_data.height<150), 'height_cat'] = 'height_100t150'
raw_data.loc[(raw_data.height>=150) & (raw_data.height<200), 'height_cat'] = 'height_150t200'
raw_data.loc[(raw_data.height>=200) & (raw_data.height<250), 'height_cat'] = 'height_200t250'
raw_data.loc[(raw_data.height>=250) & (raw_data.height<300), 'height_cat'] = 'height_250t300'
raw_data.loc[(raw_data.height>=300) & (raw_data.height<350), 'height_cat'] = 'height_300t350'
raw_data.loc[(raw_data.height>=350) & (raw_data.height<400), 'height_cat'] = 'height_350t400'
raw_data.loc[(raw_data.height>=400), 'height_cat'] = 'height_400t'
In [60]:
height_cats=pd.get_dummies(raw_data['height_cat'])
del height_cats['height_NaN'] #comparison category
del raw_data['height_cat']
height_cats.head()
Out[60]:
Width
In [61]:
raw_data['width_cat'] = 'width_NaN'
raw_data.loc[(raw_data.width>=0) & (raw_data.width<50), 'width_cat'] = 'width_0t50'
raw_data.loc[(raw_data.width>=50) & (raw_data.width<100), 'width_cat'] = 'width_50t100'
raw_data.loc[(raw_data.width>=100) & (raw_data.width<150), 'width_cat'] = 'width_100t150'
raw_data.loc[(raw_data.width>=150) & (raw_data.width<200), 'width_cat'] = 'width_150t200'
raw_data.loc[(raw_data.width>=200) & (raw_data.width<250), 'width_cat'] = 'width_200t250'
raw_data.loc[(raw_data.width>=250) & (raw_data.width<300), 'width_cat'] = 'width_250t300'
raw_data.loc[(raw_data.width>=300) & (raw_data.width<350), 'width_cat'] = 'width_300t350'
raw_data.loc[(raw_data.width>=350) & (raw_data.width<400), 'width_cat'] = 'width_350t400'
raw_data.loc[(raw_data.width>=400), 'width_cat'] = 'width_400t'
In [62]:
width_cats=pd.get_dummies(raw_data['width_cat'])
del width_cats['width_NaN'] #comparison category
del raw_data['width_cat']
width_cats.head()
Out[62]:
Replacing the continuous variables with their binned indicator variables
In [63]:
del raw_data['height'], raw_data['width'], raw_data['aratio']
In [64]:
raw_data=pd.concat([height_cats,width_cats,aspect_cats,raw_data], axis=1)
raw_data.head()
Out[64]:
Without domain knowledge or clear business logic, turning continuous variables into a series of categorical variables is a mix of empiricism and guesswork. I inspected the histograms and chose partitions that seemed sensible. This part of the model-building process can be refined through iteration.
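The manual assignments above could also be expressed with pd.cut, which makes the bin edges easier to iterate on. A sketch for the width bins (intended to match the logic above; it would have to run before the continuous columns are dropped):
In [ ]:
# right=False gives left-closed intervals ([0, 50), [50, 100), ...),
# matching the >=/< comparisons used above.
bins = [0, 50, 100, 150, 200, 250, 300, 350, 400, np.inf]
labels = ['width_0t50', 'width_50t100', 'width_100t150', 'width_150t200',
          'width_200t250', 'width_250t300', 'width_300t350', 'width_350t400',
          'width_400t']
width_cat = pd.cut(raw_data.width, bins=bins, labels=labels, right=False)
width_cat = width_cat.cat.add_categories('width_NaN').fillna('width_NaN')
width_dummies = pd.get_dummies(width_cat)
del width_dummies['width_NaN']  # comparison category, as above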
Normally I pickle datasets. However, to make the code more portable, I will save this one as a CSV.
In [65]:
raw_data.to_csv("./clean_data/modeling_data.csv",index=False)