In [145]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn import preprocessing
from sklearn.feature_selection import SelectKBest, f_classif
import scipy.stats as stats
In [146]:
# Loading the data again.
df = pd.read_fwf('https://raw.githubusercontent.com/borja876/Thinkful-DataScience-Borja/master/auto-mpg.data.txt', header=None)
df.columns = ["mpg", "cylinders", "displacement", "horsepower", 'weight', 'acceleration','modelyear','origin','carname']
df.head()
Out[146]:
In [147]:
df['cylinders'].unique()
Out[147]:
In [148]:
df['origin'].unique()
Out[148]:
In [149]:
df['modelyear'].unique()
Out[149]:
In [150]:
df['mpg'].unique()
Out[150]:
In [151]:
df['horsepower'].unique()
Out[151]:
In [152]:
# Drop the rows with missing ('?') horsepower values, then cast the numeric columns to float.
df = df[df.horsepower != '?']
df[["mpg", "cylinders", "displacement", "horsepower", "weight", "acceleration"]] = df[["mpg", "cylinders", "displacement", "horsepower", "weight", "acceleration"]].astype(float)
#df[['modelyear','origin']] = df[['modelyear','origin']].astype(object)
In [153]:
df.info()
df.head()
Out[153]:
Using a dataset of your choice, select an outcome variable and then pick four or five other variables (one to two categorical, three to four continuous) to act as the basis for features
Outcome variable: mpg. Categorical variables: cylinders, origin, and modelyear. Continuous variables: displacement, horsepower, weight, and acceleration.
In [154]:
#Plotting the relationships between variables
sns.set_style("white")
In [155]:
dfcont = df.drop(['carname','cylinders','modelyear','origin'], axis=1)
# Declare that you want to make a scatterplot matrix.
g = sns.PairGrid(dfcont, diag_sharey=False)
# Scatterplot.
g.map_upper(plt.scatter, alpha=.5)
# Fit line summarizing the linear relationship of the two variables.
g.map_lower(sns.regplot, scatter_kws=dict(alpha=0))
# Give information about the univariate distributions of the variables.
g.map_diag(sns.kdeplot, lw=3)
plt.show()
#Some warnings will show up below because the plot does not include a legend.
In [156]:
# Make the correlation matrix.
corrmat = dfcont.corr()
print(corrmat)
# Set up the matplotlib figure.
f, ax = plt.subplots(figsize=(12, 9))
# Draw the heatmap using seaborn.
sns.heatmap(corrmat, vmax=.8, square=True)
plt.show()
The correlation matrix shows that displacement, horsepower, and weight are strongly correlated with one another. Acceleration is less correlated with the rest and therefore carries more independent information.
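As a quick check on that reading, the strongly correlated pairs can be pulled out of the matrix programmatically; a minimal sketch, where the 0.7 cutoff is an arbitrary threshold rather than part of the original analysis:
In [ ]:
# Keep the upper triangle of the correlation matrix and list pairs with |r| > 0.7.
upper = corrmat.where(np.triu(np.ones(corrmat.shape, dtype=bool), k=1))
pairs = upper.stack()
print(pairs[pairs.abs() > 0.7].sort_values(ascending=False))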
In [157]:
df1 = df.drop(['carname'], axis=1)
df1.head()
Out[157]:
In [158]:
# Plot all the variables with boxplots
dfb = df1.drop(['origin','modelyear'], axis=1)
df_long = dfb
df_long = pd.melt(df_long, id_vars=['cylinders'])
g = sns.FacetGrid(df_long, col="variable",size=10, aspect=.5)
g = g.map(sns.boxplot, "cylinders", "value")
g.fig.get_axes()[0].set_yscale('log')
sns.despine(left=True)
plt.show()
For cylinders = 6 and 8, mpg, displacement, and horsepower show outliers; for cylinders = 4, acceleration shows outliers.
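Those visual impressions can be quantified with a rough outlier count per group; a minimal sketch using the conventional 1.5 × IQR fences (the fence multiplier is an assumption, not something fixed by the boxplots):
In [ ]:
# Count, per cylinder group, the values falling outside the 1.5*IQR fences.
def iqr_outliers(s):
    q1, q3 = s.quantile([0.25, 0.75])
    iqr = q3 - q1
    return ((s < q1 - 1.5 * iqr) | (s > q3 + 1.5 * iqr)).sum()

print(dfb.groupby('cylinders').agg(iqr_outliers))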
In [159]:
# Descriptive statistics by group.
df1.groupby('cylinders').describe().transpose()
Out[159]:
The counts for cylinders = 3 and 5 are very small, so those rows are discarded and only the 4-, 6-, and 8-cylinder groups are kept.
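The group sizes behind that decision are easy to confirm directly; for example:
In [ ]:
# Number of cars per cylinder count; 3- and 5-cylinder cars are rare.
print(df1['cylinders'].value_counts().sort_index())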
In [160]:
df1['cylinders'] = df1['cylinders'].astype(float)
# Drop the rare 3- and 5-cylinder cars (filtering on df1 itself, not on df).
df1 = df1[~df1['cylinders'].isin([3.0, 5.0])]
In [161]:
df1['cylinders'] = df1['cylinders'].astype(str)
dffinal1 = df1[['cylinders','modelyear','origin','mpg','displacement','horsepower','weight','acceleration']]
dffinal1.head()
Out[161]:
In [162]:
dffinal1['cylinders'].unique()
Out[162]:
In [163]:
for col in dffinal1.loc[:, 'mpg':'acceleration'].columns:
    print(col)
    print(stats.ttest_ind(
        dffinal1[dffinal1['cylinders'] == '4.0'][col],
        dffinal1[dffinal1['cylinders'] == '6.0'][col]
    ))
In [164]:
for col in dffinal1.loc[:, 'mpg':'acceleration'].columns:
    print(col)
    print(stats.ttest_ind(
        dffinal1[dffinal1['cylinders'] == '4.0'][col],
        dffinal1[dffinal1['cylinders'] == '8.0'][col]
    ))
In [165]:
for col in dffinal1.loc[:, 'mpg':'acceleration'].columns:
    print(col)
    print(stats.ttest_ind(
        dffinal1[dffinal1['cylinders'] == '6.0'][col],
        dffinal1[dffinal1['cylinders'] == '8.0'][col]
    ))
The between-group differences are significant for every variable and every pair of cylinder values, except for acceleration when comparing 4 and 6 cylinders.
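The same comparisons can be collected into a single table of p-values, which makes the one non-significant cell (acceleration, 4 vs 6 cylinders) easy to spot; a minimal sketch:
In [ ]:
# Pairwise t-test p-values for each continuous variable across cylinder groups.
pairs = [('4.0', '6.0'), ('4.0', '8.0'), ('6.0', '8.0')]
cols = dffinal1.loc[:, 'mpg':'acceleration'].columns
pvals = pd.DataFrame(index=cols, columns=['%s vs %s' % p for p in pairs])
for a, b in pairs:
    for col in cols:
        pvals.loc[col, '%s vs %s' % (a, b)] = stats.ttest_ind(
            dffinal1[dffinal1['cylinders'] == a][col],
            dffinal1[dffinal1['cylinders'] == b][col]
        ).pvalue
print(pvals)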
In [166]:
plt.figure(figsize=(20,5))
ax = sns.countplot(x="modelyear", hue='cylinders', data=dffinal1, palette="Set3")
plt.show()
# Table of counts
counttable = pd.crosstab(dffinal1['modelyear'], dffinal1['cylinders'])
print(counttable)
In [167]:
print(stats.chisquare(counttable, axis=None))
The number of cars per model year is roughly even, but the mix of cylinder values differs across years. The chi-square test indicates that these group-size differences are large enough to reflect real differences in the population.
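A variant that respects the table structure is scipy's chi2_contingency, which tests modelyear and cylinders for independence and also returns the expected counts for comparison; a sketch:
In [ ]:
# Test independence of modelyear and cylinders and inspect the expected counts.
chi2, p, dof, expected = stats.chi2_contingency(counttable)
print('chi2 = {:.1f}, p = {:.3g}, dof = {}'.format(chi2, p, dof))
print(pd.DataFrame(expected, index=counttable.index, columns=counttable.columns).round(1))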
In [168]:
# Feature 1: standard vs high-end number of cylinders.
# Build the features frame once here; later cells add columns to it rather than overwrite it.
features = pd.DataFrame(index=dffinal1.index)
features['High_end'] = np.where(dffinal1['cylinders'].isin(['6.0', '8.0']), 1, 0)
#print(pd.crosstab(features['High_end'], dffinal1['cylinders']))
In [169]:
# Feature 2: cars from the 70s vs cars from the 80s.
features['decade'] = np.where(dffinal1['modelyear'].isin(range(70, 80)), 1, 0)
#print(pd.crosstab(features['decade'], dffinal1['modelyear']))
In [170]:
# Feature 3: national (origin == 1) vs imported cars.
# Note: origin is numeric, so compare against the integer 1, not the string '1'.
features['national'] = np.where(dffinal1['origin'] == 1, 1, 0)
#print(pd.crosstab(features['national'], dffinal1['origin']))
In [171]:
# Feature 4: nacceleration, a square-root transform of acceleration.
# Making a four-panel plot.
fig = plt.figure()
fig.add_subplot(221)
plt.hist(dffinal1['acceleration'].dropna())
plt.title('Raw')
fig.add_subplot(222)
plt.hist(np.log(dffinal1['acceleration'].dropna()))
plt.title('Log')
fig.add_subplot(223)
plt.hist(np.sqrt(dffinal1['acceleration'].dropna()))
plt.title('Square root')
ax3=fig.add_subplot(224)
plt.hist(1/dffinal1['acceleration'].dropna())
plt.title('Inverse')
plt.show()
features['nacceleration'] = np.sqrt(dffinal1['acceleration'])
In [172]:
# Feature 5: CAR DHW. Composite of highly correlated variables
corrmat = dffinal1.corr()
# Set up the matplotlib figure.
f, ax = plt.subplots(figsize=(12, 9))
# Draw the heatmap using seaborn
sns.heatmap(corrmat, vmax=.8, square=True)
plt.show()
means = dffinal1[['displacement','horsepower','weight']].mean(axis=0)
stds = dffinal1[['displacement','horsepower','weight']].std(axis=0)
features['car_dhw'] = ((dffinal1[['displacement','horsepower','weight']] - means) / stds).mean(axis=1)
# Check how well the composite correlates with each of the individual variables.
plotdffinal1= dffinal1.loc[:, ['displacement','horsepower','weight']]
plotdffinal1['dhw'] = features['car_dhw']
corrmat2 = plotdffinal1.corr()
print(corrmat2)
In [173]:
# Feature 6: Carperformance. Relationship between car_dhw & nacceleration
features['carperformance'] = features['car_dhw'] * features['nacceleration']
# A plot of an interaction.
# Add 'mpg' to the features data frame for plotting.
features['mpg'] = dffinal1['mpg']
sns.lmplot(
    x='carperformance',
    y='mpg',
    data=features,
    scatter=False
)
plt.show()
In [174]:
# Feature 7: carperformance (squared).
sns.regplot(
    x=features['carperformance'],
    y=dffinal1['mpg'],
    y_jitter=.49,
    order=2,
    scatter_kws={'alpha': 0.3},
    line_kws={'color': 'black'},
    ci=None
)
plt.show()
features['carperformance_sq'] = features['carperformance'] * features['carperformance']
In [175]:
# Feature 8: standardised carperformance (squared).
means = features[['carperformance_sq']].mean(axis=0)
stds = features[['carperformance_sq']].std(axis=0)
features['standcarperformance_sq'] = ((features[['carperformance_sq']] - means) / stds).mean(axis=1)
In [176]:
# Feature 9: acceleration (squared).
sns.regplot(
    x=dffinal1['acceleration'],
    y=dffinal1['mpg'],
    y_jitter=.49,
    order=2,
    scatter_kws={'alpha': 0.3},
    line_kws={'color': 'black'},
    ci=None
)
plt.show()
features['acceleration_sq'] = dffinal1['acceleration'] * dffinal1['acceleration']
In [177]:
# Feature 10: absolute value of the dhw composite.
sns.regplot(
    x=dffinal1['acceleration'],
    y=features['car_dhw'],
    y_jitter=.49,
    order=2,
    scatter_kws={'alpha': 0.3},
    line_kws={'color': 'black'},
    ci=None
)
plt.show()
features['dhw_abs'] = features['car_dhw'].abs()
In [178]:
# Select only numeric variables to scale.
df_num = features.select_dtypes(include=[np.number]).dropna()
# Save the column names.
names=df_num.columns
# Scale, then turn the resulting numpy array back into a data frame with the correct column names.
df_scaled = pd.DataFrame(preprocessing.scale(df_num), columns=names)
# The new features contain all the information of the old ones, but on a new scale.
plt.scatter(df_num['car_dhw'], df_scaled['car_dhw'])
plt.show()
# Note the matching means (~0) and standard deviations (~1) across all scaled features.
print(df_scaled.describe())
In [179]:
predictors = ['High_end', 'decade', 'national', 'nacceleration', 'car_dhw', 'carperformance',
              'carperformance_sq', 'standcarperformance_sq', 'acceleration_sq', 'dhw_abs']
# Perform univariate feature selection (f_classif treats each distinct mpg value as a class;
# f_regression would be the usual choice for a continuous outcome).
selector = SelectKBest(f_classif, k=5)
selector.fit(df_scaled[predictors], dffinal1["mpg"])
# Get the raw p-values for each feature, and transform the p-values into scores.
scores = -np.log10(selector.pvalues_)
# Plot the scores; taller bars indicate stronger associations with mpg.
plt.bar(range(len(predictors)), scores)
plt.xticks(range(len(predictors)), predictors, rotation='vertical')
plt.show()
In [ ]: