In [1]:
%matplotlib inline
In [2]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf
import statsmodels.api as sm
The default working directory is the code subdirectory. The cell below changes to the main repo directory one level up.
In [3]:
os.chdir("..")  # move from ./code up to the repo root
In [4]:
def pd_tab(df, col, sort_by='count', asc=False):
    """Tabulate counts and percentages for a column, including NaN values."""
    tab = df[col].value_counts(dropna=False).reset_index(name='count')
    tab.columns = [col, 'count']
    tab['percent'] = tab['count'] / tab['count'].sum()
    tab.sort_values(by=sort_by, inplace=True, ascending=asc)
    return tab
In [5]:
raw_data=pd.read_csv("./raw_data/data",header=None)
raw_data.head()
Out[5]:
In [6]:
col_names = pd.read_csv("./raw_data/column.names.txt", header=None,
                        sep=":")
col_names.head()
Out[6]:
In [7]:
col_names.columns=['variable','type']
In [8]:
col_names = pd.concat((col_names,
                       pd.DataFrame({'variable': ['image_type'],
                                     'type': ['0,1.']})), axis=0)
col_names = col_names[['variable', 'type']]
In [9]:
col_names.head()
Out[9]:
In [10]:
raw_data.columns=list(col_names.variable)
raw_data.head()
Out[10]:
In [11]:
data_types=raw_data.dtypes.reset_index()
data_types.columns=['variable','d_type']
In [12]:
pd_tab(data_types,'d_type')
Out[12]:
All of the features will be converted to numeric.
Non-numeric features
In [13]:
data_types[data_types.d_type=='object']
Out[13]:
In [14]:
temp = raw_data.height[pd.to_numeric(raw_data.height, errors='coerce').isna()]
np.unique(temp)
Out[14]:
In [15]:
temp = raw_data.width[pd.to_numeric(raw_data.width, errors='coerce').isna()]
np.unique(temp)
Out[15]:
In [16]:
temp = raw_data.aratio[pd.to_numeric(raw_data.aratio, errors='coerce').isna()]
np.unique(temp)
Out[16]:
In [17]:
temp = raw_data.local[pd.to_numeric(raw_data.local, errors='coerce').isna()]
np.unique(temp)
Out[17]:
The non-numeric values are recorded as "?".
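A quick way to confirm and count those placeholders (a sketch; the whitespace-stripping is an assumption about how the raw strings may be padded):
In [ ]:
# Count the '?' placeholders in each of the four object-typed columns.
raw_data[['height', 'width', 'aratio', 'local']].apply(
    lambda s: s.astype(str).str.strip().eq('?').sum())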
Converting the image type variable into numeric
In [18]:
pd_tab(raw_data,'image_type')
Out[18]:
In [19]:
clean_data=raw_data.copy()
In [20]:
clean_data.replace({'image_type': {'nonad.':0,'ad.':1}},inplace=True)
clean_data.head()
Out[20]:
Converting all other variables into numeric
In [21]:
clean_data = clean_data.apply(pd.to_numeric, errors='coerce')  # applied column-wise; '?' becomes NaN
clean_data.head()
Out[21]:
Inspecting the 'Height' feature
In [22]:
clean_data.height.dropna().describe()
Out[22]:
In [23]:
g=sns.distplot(clean_data.height.dropna())
g.axes.set_ylim(0,)
g.axes.set_xlim(0,)
g.axes.set_title('Image Heights\n',fontsize=20)
g.set_xlabel('Height',fontsize=15)
Out[23]:
In [24]:
g=sns.distplot(np.log(clean_data.height.dropna()))
g.axes.set_ylim(0,)
g.axes.set_title('Logged Image Heights\n',fontsize=20)
g.set_xlabel('Height',fontsize=15)
Out[24]:
Taking the log of the continuous variables can be an optional pipeline step during the model training stage. In theory, many parametric models, such as the logistic classifier, benefit from (standardized) approximately symmetric feature distributions.
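As a sketch of how that optional step could look in a scikit-learn pipeline (the step names, np.log1p, and the final classifier are illustrative assumptions, not something established in this notebook):
In [ ]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer, StandardScaler
from sklearn.linear_model import LogisticRegression

# Log-transform, standardize, then classify; np.log1p tolerates zeros.
log_pipeline = Pipeline([
    ('log', FunctionTransformer(np.log1p, validate=False)),
    ('scale', StandardScaler()),
    ('clf', LogisticRegression()),
])
# Would be fit on a NaN-free numeric feature matrix, e.g. log_pipeline.fit(X, y)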
Inspecting the 'Width' feature
In [25]:
clean_data.width.dropna().describe()
Out[25]:
In [26]:
g=sns.distplot(clean_data.width.dropna())
g.axes.set_ylim(0,)
g.axes.set_xlim(0,)
g.axes.set_title('Image Widths\n',fontsize=20)
g.set_xlabel('Width',fontsize=15)
Out[26]:
In [27]:
g=sns.distplot(np.log(clean_data.width.dropna()))
g.axes.set_ylim(0,)
g.axes.set_title('Logged Image Widths\n',fontsize=20)
g.set_xlabel('Width',fontsize=15)
Out[27]:
Widths are bimodal, and there is no simple transformation to address that. I may experiment with representing width as a categorical variable.
Inspecting the aspect ratio feature
In [28]:
clean_data.aratio.dropna().describe()
Out[28]:
In [29]:
g=sns.distplot(clean_data.aratio.dropna())
g.axes.set_ylim(0,)
g.axes.set_xlim(0,)
g.axes.set_title('Image Aspect Ratio\n',fontsize=20)
g.set_xlabel('Ratio',fontsize=15)
Out[29]:
In [30]:
100*len(clean_data[clean_data.aratio>10])/len(clean_data)  # ~1.25% of instances have aratio > 10
Out[30]:
In [31]:
g=sns.distplot(np.log(clean_data.aratio.dropna()))
g.axes.set_ylim(0,)
g.axes.set_title('Logged Image Aspect Ratio\n',fontsize=20)
g.set_xlabel('Ratio',fontsize=15)
Out[31]:
Taking the log of the aspect ratio improves the symmetry of the distribution, but the result is still not approximately normal. On the non-logged scale, aspect ratios greater than 10 definitely look like outliers; however, they are still plausible for images. I created an image with an aspect ratio of 60 to confirm. Dropping outliers may result in an inability to make predictions for certain instances, which isn't always practical. Therefore, I will initially include algorithms that are robust to outliers before experimenting with removing them.
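If I do experiment with taming rather than dropping the outliers, winsorizing is one option. A minimal sketch (the 99th-percentile cap is an illustrative choice, not tuned):
In [ ]:
# Cap aspect ratios at the 99th percentile instead of dropping rows,
# so every instance can still be scored at prediction time.
cap = clean_data.aratio.quantile(0.99)
clean_data.aratio.clip(upper=cap).describe()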
In [32]:
temp=clean_data.mean().reset_index(name='Percent')
temp.columns=['Variable','Percent']
temp=temp[3:]  # drop height, width, and aratio (the continuous features)
temp['Percent']=np.round(temp['Percent']*100,2)
temp.sort_values(by='Percent',inplace=True,ascending=False)
In [33]:
temp.head()
Out[33]:
In [34]:
temp.tail()
Out[34]:
In [35]:
g=sns.distplot(temp.Percent)
g.axes.set_ylim(0,)
g.axes.set_xlim(0,100)
g.axes.set_title('Distribution of Sparsity of Binary Variables\n',fontsize=20)
g.set_xlabel('Percent of Affirmative/True Instances',fontsize=15)
Out[35]:
In [36]:
len(temp[temp.Percent<10])/len(temp)  # fraction of binary features affirmative less than 10% of the time
Out[36]:
In [37]:
len(temp[temp.Percent<1])/len(temp)  # fraction of binary features affirmative less than 1% of the time
Out[37]:
In [38]:
len(temp[temp.Percent<.1])/len(temp)  # fraction of binary features affirmative less than 0.1% of the time
Out[38]:
The feature space for the binary variables is sparse: 99% of the binary variables are affirmative less than 10% of the time, and 86% are affirmative less than 1% of the time.
In addition to using algorithms robust to sparse features, I may experiment with feature selection, such as scikit-learn's VarianceThreshold, tuned via cross-validation.
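A minimal sketch of that idea (the column slice and the 0.01 threshold are illustrative assumptions; in practice the threshold would be tuned):
In [ ]:
from sklearn.feature_selection import VarianceThreshold

# Drop binary features with variance below 0.01, i.e. roughly those
# affirmative (or negative) less than ~1% of the time.
binary_cols = clean_data.columns[3:-1]  # assumed: everything between the continuous features and the label
vt = VarianceThreshold(threshold=0.01)
vt.fit(clean_data[binary_cols].fillna(0))
print(vt.get_support().sum(), 'of', len(binary_cols), 'binary features kept')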
Rows with all missing data
In [39]:
len(clean_data)-len(clean_data.dropna(axis=0,how='all'))
Out[39]:
Rows with any missing data
In [40]:
len(clean_data)-len(clean_data.dropna(axis=0,how='any'))
Out[40]:
Counting missing instances by variable
In [41]:
temp=clean_data.isnull().sum().reset_index()
temp.columns=['variable','missing']
temp.sort_values(by='missing',inplace=True,ascending=False)
temp['percent']=np.round(100*temp['missing']/len(clean_data),2)
In [42]:
temp[temp.missing>0]
Out[42]:
Missing values in the height, width, and aspect ratio
In [43]:
len(clean_data[clean_data.height.notna() &
               clean_data.width.notna() &
               clean_data.aratio.isna()])
Out[43]:
In [44]:
len(clean_data[clean_data.height.notna() &
               clean_data.width.isna() &
               clean_data.aratio.notna()])
Out[44]:
In [45]:
len(clean_data[clean_data.height.isna() &
               clean_data.width.notna() &
               clean_data.aratio.notna()])
Out[45]:
In this data, a missing value in any one of the three continuous variables never occurs alongside complete data in the other two, so imputing one from the others is not possible.
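For context, the aspect ratio is presumably width divided by height, which is why such imputation would have been attractive. A quick sanity check of that assumption on the complete rows (the tolerance is illustrative):
In [ ]:
# On rows where all three are observed, the recomputed ratio should
# approximately match aratio if it was derived as width/height.
complete = clean_data.dropna(subset=['height', 'width', 'aratio'])
np.allclose(complete.width / complete.height, complete.aratio, atol=0.01)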
In [46]:
clean_data['missing_aratio']=clean_data.aratio.isnull()
clean_data['image_type'].groupby(clean_data['missing_aratio']).mean()*100
Out[46]:
In [47]:
f = 'image_type ~ missing_aratio'
results = smf.glm(formula=f, data=clean_data,
                  family=sm.families.Binomial()).fit()
print(results.summary())
16% of instances with a missing aspect ratio are ads, versus 8% of instances with an observed aspect ratio. The difference, in a univariate logistic regression, is statistically significant.
In light of the large percentage of missing values and the apparent non-randomness of the missingness with respect to the class being predicted, I will represent the aspect ratio, height, and width as categorical variables, with 'missing' as the reference category.
In [48]:
clean_data['missing_local']=clean_data.local.isnull()
clean_data['image_type'].groupby(clean_data['missing_local']).mean()*100
Out[48]:
The local variable is missing for only 15 observations. Assuming this is representative of the general rate at which it is missing, I will simply impute the missing values to 0. In practice, if 'local' turned out to be an extremely important feature, I would discuss the issue with people in the company who fully understand how the data was generated, to see whether smarter imputation approaches are available or whether the data collection process can be changed to avoid missing values in this variable.
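In pipeline form, the same zero-fill could be written with scikit-learn's SimpleImputer; a minimal sketch (requires scikit-learn >= 0.20, and is not how the notebook implements it below):
In [ ]:
from sklearn.impute import SimpleImputer

# Reusable transformer equivalent of filling missing 'local' values with 0.
imputer = SimpleImputer(strategy='constant', fill_value=0)
local_filled = imputer.fit_transform(clean_data[['local']])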
Starting from the raw data, I now apply everything learned during the exploration to prepare the data for modeling. The code below will be re-used in the standalone Python script that makes predictions on new raw data.
Main dataset
In [50]:
raw_data=pd.read_csv("./raw_data/data",header=None)
raw_data.head()
Out[50]:
Attach column names
In [51]:
col_names = pd.read_csv("./raw_data/column.names.txt", header=None,
                        sep=":")
col_names.columns = ['variable', 'type']
In [52]:
col_names = pd.concat((col_names,
                       pd.DataFrame({'variable': ['image_type'],
                                     'type': ['0,1.']})), axis=0)
col_names = col_names[['variable', 'type']]
In [53]:
raw_data.columns=list(col_names.variable)
In [54]:
raw_data.replace({'image_type': {'nonad.':0,'ad.':1}},inplace=True)
In [55]:
raw_data = raw_data.apply(pd.to_numeric, errors='coerce')  # applied column-wise; '?' becomes NaN
In [56]:
raw_data.loc[raw_data.local.isnull(), 'local'] = 0
Aspect Ratio
In [57]:
raw_data['aratio_cat'] = 'aratio_NaN'
raw_data.loc[(raw_data.aratio>=0) & (raw_data.aratio<2), 'aratio_cat'] = 'aratio_0t2'
raw_data.loc[(raw_data.aratio>=2) & (raw_data.aratio<4), 'aratio_cat'] = 'aratio_2t4'
raw_data.loc[(raw_data.aratio>=4) & (raw_data.aratio<6), 'aratio_cat'] = 'aratio_4t6'
raw_data.loc[(raw_data.aratio>=6) & (raw_data.aratio<8), 'aratio_cat'] = 'aratio_6t8'
raw_data.loc[(raw_data.aratio>=8) & (raw_data.aratio<10), 'aratio_cat'] = 'aratio_8t10'
raw_data.loc[(raw_data.aratio>=10), 'aratio_cat'] = 'aratio_10t'
In [58]:
aspect_cats=pd.get_dummies(raw_data['aratio_cat'])
del aspect_cats['aratio_NaN'] #comparison category
del raw_data['aratio_cat']
aspect_cats.head()
Out[58]:
Height
In [59]:
raw_data['height_cat'] = 'height_NaN'
raw_data.loc[(raw_data.height>=0) & (raw_data.height<50), 'height_cat'] = 'height_0t50'
raw_data.loc[(raw_data.height>=50) & (raw_data.height<100), 'height_cat'] = 'height_50t100'
raw_data.loc[(raw_data.height>=100) & (raw_data.height<150), 'height_cat'] = 'height_100t150'
raw_data.loc[(raw_data.height>=150) & (raw_data.height<200), 'height_cat'] = 'height_150t200'
raw_data.loc[(raw_data.height>=200) & (raw_data.height<250), 'height_cat'] = 'height_200t250'
raw_data.loc[(raw_data.height>=250) & (raw_data.height<300), 'height_cat'] = 'height_250t300'
raw_data.loc[(raw_data.height>=300) & (raw_data.height<350), 'height_cat'] = 'height_300t350'
raw_data.loc[(raw_data.height>=350) & (raw_data.height<400), 'height_cat'] = 'height_350t400'
raw_data.loc[(raw_data.height>=400), 'height_cat'] = 'height_400t'
In [60]:
height_cats=pd.get_dummies(raw_data['height_cat'])
del height_cats['height_NaN'] #comparison category
del raw_data['height_cat']
height_cats.head()
Out[60]:
Width
In [61]:
raw_data['width_cat'] = 'width_NaN'
raw_data.loc[(raw_data.width>=0) & (raw_data.width<50), 'width_cat'] = 'width_0t50'
raw_data.loc[(raw_data.width>=50) & (raw_data.width<100), 'width_cat'] = 'width_50t100'
raw_data.loc[(raw_data.width>=100) & (raw_data.width<150), 'width_cat'] = 'width_100t150'
raw_data.loc[(raw_data.width>=150) & (raw_data.width<200), 'width_cat'] = 'width_150t200'
raw_data.loc[(raw_data.width>=200) & (raw_data.width<250), 'width_cat'] = 'width_200t250'
raw_data.loc[(raw_data.width>=250) & (raw_data.width<300), 'width_cat'] = 'width_250t300'
raw_data.loc[(raw_data.width>=300) & (raw_data.width<350), 'width_cat'] = 'width_300t350'
raw_data.loc[(raw_data.width>=350) & (raw_data.width<400), 'width_cat'] = 'width_350t400'
raw_data.loc[(raw_data.width>=400), 'width_cat'] = 'width_400t'
In [62]:
width_cats=pd.get_dummies(raw_data['width_cat'])
del width_cats['width_NaN'] #comparison category
del raw_data['width_cat']
width_cats.head()
Out[62]:
Replacing the continuous variables with their binned indicator variables
In [63]:
del raw_data['height'], raw_data['width'], raw_data['aratio']
In [64]:
raw_data=pd.concat([height_cats,width_cats,aspect_cats,raw_data], axis=1)
raw_data.head()
Out[64]:
Without domain knowledge or clear business logic, turning continuous variables into a series of categorical variables is a mix of empiricism and guesswork. I inspected the histograms and chose partitions that seemed sensible. This part of the model-building process can be refined through iteration.
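The manual assignments above could also be expressed with pd.cut, which makes the bin edges easier to iterate on. A sketch for the width bins (intended to match the logic above; it would have to run before the continuous columns are dropped):
In [ ]:
# right=False gives left-closed intervals ([0, 50), [50, 100), ...),
# matching the >=/< comparisons used above.
bins = [0, 50, 100, 150, 200, 250, 300, 350, 400, np.inf]
labels = ['width_0t50', 'width_50t100', 'width_100t150', 'width_150t200',
          'width_200t250', 'width_250t300', 'width_300t350', 'width_350t400',
          'width_400t']
width_cat = pd.cut(raw_data.width, bins=bins, labels=labels, right=False)
width_cat = width_cat.cat.add_categories('width_NaN').fillna('width_NaN')
width_dummies = pd.get_dummies(width_cat)
del width_dummies['width_NaN']  # comparison category, as above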
Normally I pickle datasets. However, to make the code more portable, I will save this one as a CSV.
In [65]:
raw_data.to_csv("./clean_data/modeling_data.csv",index=False)