In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
%matplotlib inline
In [2]:
df = pd.read_csv('data/wheat-2013-supervised-edited.csv')
df.drop(df.columns[0],axis=1,inplace=True)
df.head()
Out[2]:
In [3]:
df.shape
Out[3]:
Generally, I start by viewing the distribution of the target variable to ensure that the models will be trained on "balanced" data. Upon review, the distribution is fairly balanced. Based on this distribution, I would be comfortable with a 75% (~133,000 rows) train set and a 25% (~44,000 rows) test set as my train/test split, as sketched below.
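For reference, a minimal sketch of that split, assuming scikit-learn's train_test_split is used ('Yield' is the target column used later in this notebook, and the random seed is arbitrary):

from sklearn.model_selection import train_test_split

# Hypothetical 75%/25% split matching the proportions described above
X = df.drop(columns=['Yield'])
y = df['Yield']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)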
In [4]:
col = 'Yield'
figs,axes = plt.subplots(nrows=1,ncols=2)
figs.set_figwidth(12)
figs.set_figheight(5)
df[col].plot(kind='kde', ax=axes[0], grid=True, title='KDE:'+col)
df[col].plot(kind='hist',ax=axes[1], grid=True, title='HIST:'+col)
plt.show()
I plotted the KDE and histogram of every feature except "precipTypeIsOther", since it has zero variance (a quick programmatic check follows the plotting cell below). Loosely viewing the distributions helps me evaluate the importance of each feature. Later on, I will cross-check against coefficients and/or feature importances to decide whether or not to keep a feature.
In [5]:
i = 0
figs, axes = plt.subplots(nrows=len(df.columns[:-1]), ncols=2)
figs.set_figwidth(12)
figs.set_figheight(5*len(df.columns[:-1]))
for col in df.columns[:-1]:
    i += 1
    if abs(df[col].max()) >= 1:
        # Larger-scale features: KDE on the left, histogram on the right
        df[col].plot(kind='kde', ax=axes[i-1, 0], grid=True, title='KDE:'+col)
        df[col].plot(kind='hist', ax=axes[i-1, 1], grid=True, title='HIST:'+col)
    elif abs(df[col].max()) < 1:
        # Features bounded below 1: KDE on a linear axis and on a log x-axis
        df[col].plot(kind='kde', ax=axes[i-1, 0], grid=True, title='KDE:'+col)
        df[col].plot(kind='kde', ax=axes[i-1, 1], grid=True, title='KDE(log(x)):'+col, logx=True)
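As a side note, here is a hedged one-off check for the zero-variance claim above (numeric_only is passed in case any non-numeric columns remain in the frame):

# Columns with zero variance carry no information for the model;
# this is expected to flag 'precipTypeIsOther'
variances = df.var(numeric_only=True)
print(variances[variances == 0].index.tolist())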
In [6]:
df.dropna(inplace=True)
df.drop(df.columns[:5],axis=1,inplace=True)
sns.pairplot(df)
Out[6]:
In [7]:
cols = list(df.columns)
cm = np.corrcoef(df[cols].values.T)
plt.figure(figsize=(10,10))
sns.set(font_scale=1.5)
hm = sns.heatmap(cm, cbar=True,
                 annot=True,
                 square=True,
                 fmt='.2f',
                 annot_kws={'size': 5},
                 yticklabels=cols,
                 xticklabels=cols)
plt.show()
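To make the heatmap easier to act on, here is a small sketch of how the correlation matrix above could be reduced to a shortlist of strongly correlated feature pairs (the 0.8 cutoff is an arbitrary choice, not one from the original analysis):

# List feature pairs whose absolute correlation exceeds an arbitrary cutoff
threshold = 0.8
for i in range(len(cols)):
    for j in range(i + 1, len(cols)):
        if abs(cm[i, j]) >= threshold:
            print(cols[i], cols[j], round(cm[i, j], 2))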