In [24]:
%pylab inline
pylab.style.use('ggplot')
import pandas as pd
import numpy as np
In [25]:
# Red-wine quality data from the UCI ML repository (semicolon-delimited CSV).
data_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv'
wine_df = pd.read_csv(data_url, sep=';')
In [26]:
# Preview the first five rows of the raw data
wine_df.head()
Out[26]:
In [27]:
wine_df = wine_df.rename(columns={c: c.lower().replace(' ', '_') for c in wine_df.columns})
In [28]:
# Confirm the snake_case column names after the rename
wine_df.head()
Out[28]:
In [29]:
# Column dtypes and non-null counts
wine_df.info()
In [30]:
# Summary statistics, transposed so each feature is a row
wine_df.describe().T
Out[30]:
In [31]:
# Mean minus median for every predictor: a positive gap suggests
# right-skew and hence possible outliers.
features = wine_df.drop('quality', axis=1)
feature_means = features.mean(axis=0)
feature_medians = features.median(axis=0)
feature_means - feature_medians
Out[31]:
All the features except density have mean > median. This indicates that the distributions are right-skewed, suggesting the possible presence of outliers.
In [32]:
# One box plot per predictor, grouped by quality rating
for column in wine_df.columns.drop('quality'):
    wine_df.boxplot(by='quality', column=column)
In [47]:
# Histogram of each predictor, each on its own figure
for column in wine_df.columns.drop('quality'):
    pylab.figure()
    axes = wine_df[column].plot(kind='hist', bins=20)
    axes.set(xlabel=column)
In [34]:
# Pairwise correlations among the predictors (target column excluded)
feature_corrs = wine_df.drop(columns='quality').corr()
In [35]:
# NOTE(review): seaborn is imported mid-notebook; conventionally imports
# belong in the top cell.
import seaborn as sns
# Annotated heatmap of the feature-feature correlation matrix
sns.heatmap(feature_corrs, annot=True)
Out[35]:
In [36]:
# Correlation of every predictor with the target, as a horizontal bar chart
features_df = wine_df.drop(columns='quality')
features_df.corrwith(wine_df['quality']).plot(kind='barh')
Out[36]:
In [37]:
# Class balance: how many wines fall into each quality rating
wine_df['quality'].value_counts().sort_index().plot(kind='barh')
Out[37]:
In [38]:
import statsmodels.formula.api as sm
# Baseline OLS: regress quality on alcohol and volatile acidity only
model = sm.ols(formula='quality ~ alcohol + volatile_acidity', data=wine_df)
result = model.fit()
# Coefficient table, R-squared, and diagnostics
result.summary()
Out[38]:
In [39]:
# OLS with every predictor (statsmodels was already imported as `sm` in the
# baseline-model cell; the duplicate import is removed). The formula is
# built programmatically so it tracks the current set of columns.
formula = ' quality ~ ' + ' + '.join([c for c in wine_df.keys() if c != 'quality'])
model_all = sm.ols(formula=formula, data=wine_df)
result_all = model_all.fit()
result_all.summary()
Out[39]:
In [40]:
# quality vs alcohol scatter.
# NOTE(review): sort_values has no visible effect on a scatter plot -- likely leftover.
wine_df.sort_values('quality').plot(kind='scatter', x='quality', y='alcohol')
Out[40]:
In [41]:
# quality vs citric acid scatter.
# NOTE(review): sort_values has no visible effect on a scatter plot -- likely leftover.
wine_df.sort_values('quality').plot(kind='scatter', x='quality', y='citric_acid')
Out[41]:
In [42]:
# Residuals of the all-features OLS fit; a roughly symmetric, centred
# histogram supports the normal-errors assumption
result_all.resid.plot(kind='hist', bins=20)
Out[42]:
In [43]:
# Rank of the feature covariance matrix -- full rank means no exact
# linear dependence among the predictors
feature_cov = wine_df.drop(columns='quality').cov()
np.linalg.matrix_rank(feature_cov)
Out[43]:
In [44]:
import sklearn.preprocessing as prep

# Split predictors from the target.
feature_df = wine_df.drop('quality', axis=1)
target_df = wine_df['quality']

# z-score every feature (zero mean, unit variance). fit_transform returns a
# plain ndarray, so rebuild a labelled DataFrame around it. Renamed the
# result from `scaler` -- it holds the transformed values, not a scaler.
scaled_values = prep.StandardScaler().fit_transform(feature_df)
scaled_features_df = pd.DataFrame(data=scaled_values, index=target_df.index, columns=feature_df.columns)
In [45]:
# Sanity-check the standardised features
scaled_features_df.head()
Out[45]:
In [46]:
from sklearn.decomposition import PCA

# A single full-rank PCA fit yields the per-component explained-variance
# ratios; their cumulative sum gives, for every k, the share of variance
# explained by the first k components. This replaces the earlier loop that
# refit PCA once per component count (one redundant fit per k) and whose
# loop variable `result` shadowed the OLS `result` defined above.
pca = PCA(n_components=scaled_features_df.shape[1])
pca.fit(scaled_features_df)
cum_var_pct = np.cumsum(pca.explained_variance_ratio_) * 100.0
var_explained = {k: pct for k, pct in enumerate(cum_var_pct, start=1)}
ax = pd.Series(var_explained).plot(kind='bar')
ax.set(xlabel='Number of principal components', ylabel='Pct. of Variance Explained')
Out[46]:
In [ ]: