In [65]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math
from matplotlib.mlab import PCA as mlabPCA
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn import preprocessing
from sklearn.feature_selection import SelectKBest
import seaborn as sns
import scipy.stats as stats
In [66]:
# Loading the data again.
df = pd.read_fwf('https://raw.githubusercontent.com/borja876/Thinkful-DataScience-Borja/master/auto-mpg.data.txt', header=None)
df.columns = ["mpg", "cylinders", "displacement", "horsepower", 'weight', 'acceleration','modelyear','origin','carname']
df.head()
Out[66]:
In [67]:
df['cylinders'].unique()
Out[67]:
In [68]:
df['origin'].unique()
Out[68]:
In [69]:
df['modelyear'].unique()
Out[69]:
In [70]:
df['mpg'].unique()
Out[70]:
In [71]:
df['horsepower'].unique()
Out[71]:
In [72]:
df = df.drop( df[(df.horsepower == '?')].index )
df[["mpg", "cylinders", "displacement", "horsepower", 'weight', 'acceleration']] = df[["mpg", "cylinders", "displacement", "horsepower", 'weight', 'acceleration']].astype(float)
#df[['modelyear','origin']] = df[['modelyear','origin']].astype(object)
In [73]:
df.info()
df.head()
Out[73]:
Using a dataset of your choice, select an outcome variable and then pick four or five other variables (one to two categorical, three to four continuous) to act as the basis for features
Outcome variable: mpg Categorical variables: cylynders, origin and year Continuous: displacement, horsepower, weight, acceleration, displacement
In [74]:
#Plotting the relationships between variables
sns.set_style("white")
In [75]:
dfcont = df.drop(['carname','cylinders','modelyear','origin'], axis=1)
# Declare that you want to make a scatterplot matrix.
g = sns.PairGrid(dfcont, diag_sharey=False)
# Scatterplot.
g.map_upper(plt.scatter, alpha=.5)
# Fit line summarizing the linear relationship of the two variables.
g.map_lower(sns.regplot, scatter_kws=dict(alpha=0))
# Give information about the univariate distributions of the variables.
g.map_diag(sns.kdeplot, lw=3)
plt.show()
#Some warnings will show up below because the plot does not include a legend.
In [76]:
# Make the correlation matrix.
corrmat = dfcont.corr()
print(corrmat)
# Set up the matplotlib figure.
f, ax = plt.subplots(figsize=(12, 9))
# Draw the heatmap using seaborn.
sns.heatmap(corrmat, vmax=.8, square=True)
plt.show()
From the correlation matrix it seems that displacement, horsepower and weight are strongly correlated. Acceleration is less correlated with the rest thus providing more information.
In [77]:
df1 = df.drop(['carname'], axis=1)
df1.head()
Out[77]:
In [78]:
# Plot all the variables with boxplots
dfb = df1.drop(['origin','modelyear'], axis=1)
df_long = dfb
df_long = pd.melt(df_long, id_vars=['cylinders'])
g = sns.FacetGrid(df_long, col="variable",size=10, aspect=.5)
g = g.map(sns.boxplot, "cylinders", "value")
g.fig.get_axes()[0].set_yscale('log')
sns.despine(left=True)
plt.show()
For cylinders = 6 & 8: mpg, displacement & horsepower present outliers For cylinders = 4, acceleration present outliers
In [79]:
# Descriptive statistics by group.
df1.groupby('cylinders').describe().transpose()
Out[79]:
The number of counts for cylinders = 3 and 5 is very small so they are discarded considering only 4, 6 & 8
In [80]:
df1['cylinders'] = df1["cylinders"].astype(float)
df1 = df1.drop( df[(df.cylinders == 3.0)].index )
df1 = df1.drop( df[(df.cylinders == 5.0)].index )
In [81]:
df1['cylinders'] = df1['cylinders'].astype(str)
dffinal1 = df1[['cylinders','modelyear','origin','mpg','displacement','horsepower','weight','acceleration']]
dffinal1.head()
Out[81]:
In [82]:
dffinal1['cylinders'].unique()
Out[82]:
In [83]:
for col in dffinal1.loc[:,'mpg':'acceleration'].columns:
print(col)
print(stats.ttest_ind(
dffinal1[dffinal1['cylinders'] == '4.0'][col],
dffinal1[dffinal1['cylinders'] == '6.0'][col]
))
In [84]:
for col in dffinal1.loc[:,'mpg':'acceleration'].columns:
print(col)
print(stats.ttest_ind(
dffinal1[dffinal1['cylinders'] == '4.0'][col],
dffinal1[dffinal1['cylinders'] == '8.0'][col]
))
In [85]:
for col in dffinal1.loc[:,'mpg':'acceleration'].columns:
print(col)
print(stats.ttest_ind(
dffinal1[dffinal1['cylinders'] == '6.0'][col],
dffinal1[dffinal1['cylinders'] == '8.0'][col]
))
The difference for all variables for each cylinders value is significant (except for acceleration when comparing 4 & 6)
In [86]:
plt.figure(figsize=(20,5))
ax = sns.countplot(x="modelyear", hue='cylinders', data=dffinal1, palette="Set3")
plt.show()
# Table of counts
counttable = pd.crosstab(dffinal1['modelyear'], dffinal1['cylinders'])
print(counttable)
In [87]:
print(stats.chisquare(counttable, axis=None))
Modelyear on average is equivalent regarding the population per year. There are differences regarding the cylinders values. The group size differences are large enough to reflect differences on the population.
In [88]:
#Feature 1: Standard number of cylinders vs high end number of cylinders
features = pd.get_dummies(dffinal1['cylinders'])
features['High_end'] = np.where((dffinal1['cylinders'].isin(['6.0', '8.0'])), 1, 0)
#print(pd.crosstab(features['High_end'], dffinal1['cylinders']))
In [89]:
#Feature 2: # Cars from the 70s and cars from the 80s.
features = pd.get_dummies(dffinal1['modelyear'])
features['decade'] = np.where((dffinal1['modelyear'].isin(range(70,80))), 1, 0)
#print(pd.crosstab(features['decade'], dffinal1['modelyear']))
In [90]:
# Feature 3: National cars vs imported cars
features = pd.get_dummies(dffinal1['origin'])
features['national'] = np.where((dffinal1['origin'].isin(['1'])), 1, 0)
#print(pd.crosstab(features['national'], dffinal1['origin']))
In [91]:
# Feature 4: Nacceleration: Normalized acceleration
# Making a four-panel plot.
fig = plt.figure()
fig.add_subplot(221)
plt.hist(dffinal1['acceleration'].dropna())
plt.title('Raw')
fig.add_subplot(222)
plt.hist(np.log(dffinal1['acceleration'].dropna()))
plt.title('Log')
fig.add_subplot(223)
plt.hist(np.sqrt(dffinal1['acceleration'].dropna()))
plt.title('Square root')
ax3=fig.add_subplot(224)
plt.hist(1/df['acceleration'].dropna())
plt.title('Inverse')
plt.show()
features['nacceleration'] = np.sqrt(dffinal1['acceleration'])
In [92]:
# Feature 5: CAR DHW. Composite of highly correlated variables
corrmat = dffinal1.corr()
# Set up the matplotlib figure.
f, ax = plt.subplots(figsize=(12, 9))
# Draw the heatmap using seaborn
sns.heatmap(corrmat, vmax=.8, square=True)
plt.show()
means = dffinal1[['displacement','horsepower','weight']].mean(axis=0)
stds = dffinal1[['displacement','horsepower','weight']].std(axis=0)
features['car_dhw'] = ((dffinal1[['displacement','horsepower','weight']] - means) / stds).mean(axis=1)
# Check how well the composite correlates with each of the individual variables.
plotdffinal1= dffinal1.loc[:, ['displacement','horsepower','weight']]
plotdffinal1['dhw'] = features['car_dhw']
corrmat2 = plotdffinal1.corr()
print(corrmat2)
In [93]:
# Feature 6: Carperformance. Relationship between car_dhw & nacceleration
features['carperformance'] = features['car_dhw'] * features['nacceleration']
# A plot of an interaction.
# Add the 'tvtot' feature to the features data frame for plotting.
features['mpg'] = dffinal1['mpg']
sns.lmplot(
x='carperformance',
y='mpg',
data=features,
scatter=False
)
plt.show()
In [94]:
# Feature 7: Carperformance (squared).
sns.regplot(
features['carperformance'],
y=dffinal1['mpg'],
y_jitter=.49,
order=2,
scatter_kws={'alpha':0.3},
line_kws={'color':'black'},
ci=None
)
plt.show()
features['carperformance_sq'] = features['carperformance'] * features['carperformance']
In [95]:
# Feature 7: standardised carperformance (squared).
means = features[['carperformance_sq']].mean(axis=0)
stds = features[['carperformance_sq']].std(axis=0)
features['standcarperformance_sq'] = ((features[['carperformance_sq']] - means) / stds).mean(axis=1)
In [96]:
# Feature 8: Acceleration (squared).
sns.regplot(
dffinal1['acceleration'],
y=dffinal1['mpg'],
y_jitter=.49,
order=2,
scatter_kws={'alpha':0.3},
line_kws={'color':'black'},
ci=None
)
plt.show()
features['acceleration_sq'] = dffinal1['acceleration'] * dffinal1['acceleration']
In [97]:
# Feature 9: Dhw composite value abs.
sns.regplot(
dffinal1['acceleration'],
y=features['car_dhw'],
y_jitter=.49,
order=2,
scatter_kws={'alpha':0.3},
line_kws={'color':'black'},
ci=None
)
plt.show()
features['dhw_abs'] = features['car_dhw'].abs()
In [98]:
# Select only numeric variables to scale.
df_num = features.select_dtypes(include=[np.number]).dropna()
# Save the column names.
names=df_num.columns
# Scale, then turn the resulting numpy array back into a data frame with the correct column names.
df_scaled = pd.DataFrame(preprocessing.scale(df_num), columns=names)
# The new features contain all the information of the old ones, but on a new scale.
plt.scatter(df_num['car_dhw'], df_scaled['car_dhw'])
plt.show()
# Lookit all those matching means and standard deviations!
print(df_scaled.describe())
In [109]:
# Normalize the data so that all variables have a mean of 0 and standard deviation
# of 1.
X = StandardScaler().fit_transform(df_scaled)
# The NumPy covariance function assumes that variables are represented by rows,
# not columns, so we transpose X.
Xt = X.T
Cx = np.cov(Xt)
print('Covariance Matrix:\n', Cx)
In [111]:
# Calculating eigenvalues and eigenvectors.
eig_val_cov, eig_vec_cov = np.linalg.eig(Cx)
# Inspecting the eigenvalues and eigenvectors.
for i in range(len(eig_val_cov)):
eigvec_cov = eig_vec_cov[:, i].reshape(1, 12).T
print('Eigenvector {}: \n{}'.format(i + 1, eigvec_cov))
print('Eigenvalue {}: {}'.format(i + 1, eig_val_cov[i]))
print(40 * '-')
print(
'The percentage of total variance in the dataset explained by each',
'component calculated by hand.\n',
eig_val_cov / sum(eig_val_cov)
)
In [112]:
#From the Scree plot we should use onle the first 2 components that will explain 46% and 23% each
plt.plot(eig_val_cov)
plt.show()
In [114]:
# Create P, which we will use to transform Cx into Cy to get Y, the
# dimensionally-reduced representation of X.
P = eig_vec_cov[:, 0]
# Transform X into Y.
Y = P.T.dot(Xt)
# Combine X and Y for plotting purposes.
data_to_plot = df_scaled[['nacceleration','car_dhw','carperformance','carperformance_sq','standcarperformance_sq','acceleration_sq','dhw_abs']]
data_to_plot['Component'] = Y
data_to_plot = pd.melt(data_to_plot, id_vars='Component')
g = sns.FacetGrid(data_to_plot, col="variable", size=4, aspect=.5)
g = g.map(
sns.regplot,
"Component",
"value",
x_jitter=.49,
y_jitter=.49,
fit_reg=False
)
plt.show()
In [121]:
sklearn_pca = PCA(n_components=5)
Y_sklearn = sklearn_pca.fit_transform(X)
print(
'The percentage of total variance in the dataset explained by each',
'component from Sklearn PCA.\n',
sklearn_pca.explained_variance_ratio_
)
# Compare the sklearn solution to ours – a perfect match.
plt.plot(Y_sklearn[:, 0], Y, 'o')
plt.title('Comparing solutions')
plt.ylabel('Sklearn Component 1')
plt.xlabel('By-hand Component 1')
plt.show()
In [ ]: