In [65]:

    
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math
from matplotlib.mlab import PCA as mlabPCA
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA 
from sklearn import preprocessing
from sklearn.feature_selection import SelectKBest
import seaborn as sns
import scipy.stats as stats



In [66]:

    
# Loading the data again.

df = pd.read_fwf('https://raw.githubusercontent.com/borja876/Thinkful-DataScience-Borja/master/auto-mpg.data.txt', header=None)
df.columns = ["mpg", "cylinders", "displacement", "horsepower", 'weight', 'acceleration','modelyear','origin','carname']
df.head()









    Out[66]:







  
    
      
      mpg
      cylinders
      displacement
      horsepower
      weight
      acceleration
      modelyear
      origin
      carname
    
  
  
    
      0
      18.0
      8
      307.0
      130.0
      3504.0
      12.0
      70
      1
      "chevrolet chevelle malibu"
    
    
      1
      15.0
      8
      350.0
      165.0
      3693.0
      11.5
      70
      1
      "buick skylark 320"
    
    
      2
      18.0
      8
      318.0
      150.0
      3436.0
      11.0
      70
      1
      "plymouth satellite"
    
    
      3
      16.0
      8
      304.0
      150.0
      3433.0
      12.0
      70
      1
      "amc rebel sst"
    
    
      4
      17.0
      8
      302.0
      140.0
      3449.0
      10.5
      70
      1
      "ford torino"



In [67]:

    
df['cylinders'].unique()









    Out[67]:





array([8, 4, 6, 3, 5], dtype=int64)



In [68]:

    
df['origin'].unique()









    Out[68]:





array([1, 3, 2], dtype=int64)



In [69]:

    
df['modelyear'].unique()









    Out[69]:





array([70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82], dtype=int64)



In [70]:

    
df['mpg'].unique()









    Out[70]:





array([ 18. ,  15. ,  16. ,  17. ,  14. ,  24. ,  22. ,  21. ,  27. ,
        26. ,  25. ,  10. ,  11. ,   9. ,  28. ,  19. ,  12. ,  13. ,
        23. ,  30. ,  31. ,  35. ,  20. ,  29. ,  32. ,  33. ,  17.5,
        15.5,  14.5,  22.5,  24.5,  18.5,  29.5,  26.5,  16.5,  31.5,
        36. ,  25.5,  33.5,  20.5,  30.5,  21.5,  43.1,  36.1,  32.8,
        39.4,  19.9,  19.4,  20.2,  19.2,  25.1,  20.6,  20.8,  18.6,
        18.1,  17.7,  27.5,  27.2,  30.9,  21.1,  23.2,  23.8,  23.9,
        20.3,  21.6,  16.2,  19.8,  22.3,  17.6,  18.2,  16.9,  31.9,
        34.1,  35.7,  27.4,  25.4,  34.2,  34.5,  31.8,  37.3,  28.4,
        28.8,  26.8,  41.5,  38.1,  32.1,  37.2,  26.4,  24.3,  19.1,
        34.3,  29.8,  31.3,  37. ,  32.2,  46.6,  27.9,  40.8,  44.3,
        43.4,  36.4,  44.6,  40.9,  33.8,  32.7,  23.7,  23.6,  32.4,
        26.6,  25.8,  23.5,  39.1,  39. ,  35.1,  32.3,  37.7,  34.7,
        34.4,  29.9,  33.7,  32.9,  31.6,  28.1,  30.7,  24.2,  22.4,
        34. ,  38. ,  44. ])



In [71]:

    
df['horsepower'].unique()









    Out[71]:





array(['130.0', '165.0', '150.0', '140.0', '198.0', '220.0', '215.0',
       '225.0', '190.0', '170.0', '160.0', '95.00', '97.00', '85.00',
       '88.00', '46.00', '87.00', '90.00', '113.0', '200.0', '210.0',
       '193.0', '?', '100.0', '105.0', '175.0', '153.0', '180.0', '110.0',
       '72.00', '86.00', '70.00', '76.00', '65.00', '69.00', '60.00',
       '80.00', '54.00', '208.0', '155.0', '112.0', '92.00', '145.0',
       '137.0', '158.0', '167.0', '94.00', '107.0', '230.0', '49.00',
       '75.00', '91.00', '122.0', '67.00', '83.00', '78.00', '52.00',
       '61.00', '93.00', '148.0', '129.0', '96.00', '71.00', '98.00',
       '115.0', '53.00', '81.00', '79.00', '120.0', '152.0', '102.0',
       '108.0', '68.00', '58.00', '149.0', '89.00', '63.00', '48.00',
       '66.00', '139.0', '103.0', '125.0', '133.0', '138.0', '135.0',
       '142.0', '77.00', '62.00', '132.0', '84.00', '64.00', '74.00',
       '116.0', '82.00'], dtype=object)



In [72]:

    
df = df.drop( df[(df.horsepower == '?')].index )
df[["mpg", "cylinders", "displacement", "horsepower", 'weight', 'acceleration']] = df[["mpg", "cylinders", "displacement", "horsepower", 'weight', 'acceleration']].astype(float)
#df[['modelyear','origin']] = df[['modelyear','origin']].astype(object)



In [73]:

    
df.info()
df.head()









    



<class 'pandas.core.frame.DataFrame'>
Int64Index: 392 entries, 0 to 397
Data columns (total 9 columns):
mpg             392 non-null float64
cylinders       392 non-null float64
displacement    392 non-null float64
horsepower      392 non-null float64
weight          392 non-null float64
acceleration    392 non-null float64
modelyear       392 non-null int64
origin          392 non-null int64
carname         392 non-null object
dtypes: float64(6), int64(2), object(1)
memory usage: 29.1+ KB






    Out[73]:







  
    
      
      mpg
      cylinders
      displacement
      horsepower
      weight
      acceleration
      modelyear
      origin
      carname
    
  
  
    
      0
      18.0
      8.0
      307.0
      130.0
      3504.0
      12.0
      70
      1
      "chevrolet chevelle malibu"
    
    
      1
      15.0
      8.0
      350.0
      165.0
      3693.0
      11.5
      70
      1
      "buick skylark 320"
    
    
      2
      18.0
      8.0
      318.0
      150.0
      3436.0
      11.0
      70
      1
      "plymouth satellite"
    
    
      3
      16.0
      8.0
      304.0
      150.0
      3433.0
      12.0
      70
      1
      "amc rebel sst"
    
    
      4
      17.0
      8.0
      302.0
      140.0
      3449.0
      10.5
      70
      1
      "ford torino"

Features for a car from the 70s or 80s that will give better mpg

Using a dataset of your choice, select an outcome variable and then pick four or five other variables (one to two categorical, three to four continuous) to act as the basis for features

Outcome variable: mpg Categorical variables: cylynders, origin and year Continuous: displacement, horsepower, weight, acceleration, displacement



In [74]:

    
#Plotting the relationships between variables
sns.set_style("white")



In [75]:

    
dfcont = df.drop(['carname','cylinders','modelyear','origin'], axis=1)
# Declare that you want to make a scatterplot matrix.
g = sns.PairGrid(dfcont, diag_sharey=False)
# Scatterplot.
g.map_upper(plt.scatter, alpha=.5)
# Fit line summarizing the linear relationship of the two variables.
g.map_lower(sns.regplot, scatter_kws=dict(alpha=0))
# Give information about the univariate distributions of the variables.
g.map_diag(sns.kdeplot, lw=3)
plt.show()

#Some warnings will show up below because the plot does not include a legend.









    



d:\users\borja.gonzalez\appdata\local\programs\python\python36-32\lib\site-packages\matplotlib\axes\_axes.py:545: UserWarning: No labelled objects found. Use label='...' kwarg on individual plots.
  warnings.warn("No labelled objects found. "



In [76]:

    
# Make the correlation matrix.
corrmat = dfcont.corr()
print(corrmat)

# Set up the matplotlib figure.
f, ax = plt.subplots(figsize=(12, 9))

# Draw the heatmap using seaborn.
sns.heatmap(corrmat, vmax=.8, square=True)
plt.show()









    



                   mpg  displacement  horsepower    weight  acceleration
mpg           1.000000     -0.805127   -0.778427 -0.832244      0.423329
displacement -0.805127      1.000000    0.897257  0.932994     -0.543800
horsepower   -0.778427      0.897257    1.000000  0.864538     -0.689196
weight       -0.832244      0.932994    0.864538  1.000000     -0.416839
acceleration  0.423329     -0.543800   -0.689196 -0.416839      1.000000

From the correlation matrix it seems that displacement, horsepower and weight are strongly correlated. Acceleration is less correlated with the rest thus providing more information.



In [77]:

    
df1 = df.drop(['carname'], axis=1)
df1.head()









    Out[77]:







  
    
      
      mpg
      cylinders
      displacement
      horsepower
      weight
      acceleration
      modelyear
      origin
    
  
  
    
      0
      18.0
      8.0
      307.0
      130.0
      3504.0
      12.0
      70
      1
    
    
      1
      15.0
      8.0
      350.0
      165.0
      3693.0
      11.5
      70
      1
    
    
      2
      18.0
      8.0
      318.0
      150.0
      3436.0
      11.0
      70
      1
    
    
      3
      16.0
      8.0
      304.0
      150.0
      3433.0
      12.0
      70
      1
    
    
      4
      17.0
      8.0
      302.0
      140.0
      3449.0
      10.5
      70
      1



In [78]:

    
# Plot all the variables with boxplots
dfb = df1.drop(['origin','modelyear'], axis=1)
df_long = dfb
df_long = pd.melt(df_long, id_vars=['cylinders'])


g = sns.FacetGrid(df_long, col="variable",size=10, aspect=.5)
g = g.map(sns.boxplot, "cylinders", "value")
g.fig.get_axes()[0].set_yscale('log')
sns.despine(left=True)
plt.show()









    



d:\users\borja.gonzalez\appdata\local\programs\python\python36-32\lib\site-packages\seaborn\axisgrid.py:703: UserWarning: Using the boxplot function without specifying `order` is likely to produce an incorrect plot.
  warnings.warn(warning)

For cylinders = 6 & 8: mpg, displacement & horsepower present outliers For cylinders = 4, acceleration present outliers



In [79]:

    
# Descriptive statistics by group.
df1.groupby('cylinders').describe().transpose()









    Out[79]:







  
    
      
      cylinders
      3.0
      4.0
      5.0
      6.0
      8.0
    
  
  
    
      acceleration
      count
      4.000000
      199.000000
      3.000000
      83.000000
      103.000000
    
    
      mean
      13.250000
      16.581910
      18.633333
      16.254217
      12.955340
    
    
      std
      0.500000
      2.383185
      2.369247
      2.031778
      2.224759
    
    
      min
      12.500000
      11.600000
      15.900000
      11.300000
      8.000000
    
    
      25%
      13.250000
      14.800000
      17.900000
      15.050000
      11.500000
    
    
      50%
      13.500000
      16.200000
      19.900000
      16.000000
      13.000000
    
    
      75%
      13.500000
      18.000000
      20.000000
      17.600000
      14.000000
    
    
      max
      13.500000
      24.800000
      20.100000
      21.000000
      22.200000
    
    
      displacement
      count
      4.000000
      199.000000
      3.000000
      83.000000
      103.000000
    
    
      mean
      72.500000
      109.670854
      145.000000
      218.361446
      345.009709
    
    
      std
      5.000000
      21.376813
      33.286634
      32.427246
      46.776376
    
    
      min
      70.000000
      68.000000
      121.000000
      145.000000
      260.000000
    
    
      25%
      70.000000
      91.000000
      126.000000
      199.500000
      305.000000
    
    
      50%
      70.000000
      105.000000
      131.000000
      231.000000
      350.000000
    
    
      75%
      72.500000
      121.000000
      157.000000
      250.000000
      360.000000
    
    
      max
      80.000000
      156.000000
      183.000000
      262.000000
      455.000000
    
    
      horsepower
      count
      4.000000
      199.000000
      3.000000
      83.000000
      103.000000
    
    
      mean
      99.250000
      78.281407
      82.333333
      101.506024
      158.300971
    
    
      std
      8.301606
      14.523099
      18.583146
      14.310472
      28.453552
    
    
      min
      90.000000
      46.000000
      67.000000
      72.000000
      90.000000
    
    
      25%
      95.250000
      68.000000
      72.000000
      92.500000
      140.000000
    
    
      50%
      98.500000
      78.000000
      77.000000
      100.000000
      150.000000
    
    
      75%
      102.500000
      88.000000
      90.000000
      110.000000
      175.000000
    
    
      max
      110.000000
      115.000000
      103.000000
      165.000000
      230.000000
    
    
      modelyear
      count
      4.000000
      199.000000
      3.000000
      83.000000
      103.000000
    
    
      mean
      75.500000
      77.030151
      79.000000
      75.951807
      73.902913
    
    
      std
      3.696846
      3.737484
      1.000000
      3.264381
      3.021214
    
    
      min
      72.000000
      70.000000
      78.000000
      70.000000
      70.000000
    
    
      25%
      72.750000
      74.000000
      78.500000
      74.000000
      72.000000
    
    
      50%
      75.000000
      77.000000
      79.000000
      76.000000
      73.000000
    
    
      75%
      77.750000
      80.000000
      79.500000
      78.000000
      76.000000
    
    
      max
      80.000000
      82.000000
      80.000000
      82.000000
      81.000000
    
    
      mpg
      count
      4.000000
      199.000000
      3.000000
      83.000000
      103.000000
    
    
      mean
      20.550000
      29.283920
      27.366667
      19.973494
      14.963107
    
    
      std
      2.564501
      5.670546
      8.228204
      3.828809
      2.836284
    
    
      min
      18.000000
      18.000000
      20.300000
      15.000000
      9.000000
    
    
      25%
      18.750000
      25.000000
      22.850000
      18.000000
      13.000000
    
    
      50%
      20.250000
      28.400000
      25.400000
      19.000000
      14.000000
    
    
      75%
      22.050000
      32.950000
      30.900000
      21.000000
      16.000000
    
    
      max
      23.700000
      46.600000
      36.400000
      38.000000
      26.600000
    
    
      origin
      count
      4.000000
      199.000000
      3.000000
      83.000000
      103.000000
    
    
      mean
      3.000000
      2.000000
      2.000000
      1.192771
      1.000000
    
    
      std
      0.000000
      0.834847
      0.000000
      0.551225
      0.000000
    
    
      min
      3.000000
      1.000000
      2.000000
      1.000000
      1.000000
    
    
      25%
      3.000000
      1.000000
      2.000000
      1.000000
      1.000000
    
    
      50%
      3.000000
      2.000000
      2.000000
      1.000000
      1.000000
    
    
      75%
      3.000000
      3.000000
      2.000000
      1.000000
      1.000000
    
    
      max
      3.000000
      3.000000
      2.000000
      3.000000
      1.000000
    
    
      weight
      count
      4.000000
      199.000000
      3.000000
      83.000000
      103.000000
    
    
      mean
      2398.500000
      2305.110553
      3103.333333
      3202.120482
      4114.718447
    
    
      std
      247.566153
      342.872223
      374.343870
      332.383425
      448.833159
    
    
      min
      2124.000000
      1613.000000
      2830.000000
      2472.000000
      3086.000000
    
    
      25%
      2278.500000
      2047.500000
      2890.000000
      2945.000000
      3799.000000
    
    
      50%
      2375.000000
      2230.000000
      2950.000000
      3210.000000
      4140.000000
    
    
      75%
      2495.000000
      2562.500000
      3240.000000
      3431.000000
      4403.500000
    
    
      max
      2720.000000
      3270.000000
      3530.000000
      3907.000000
      5140.000000

The number of counts for cylinders = 3 and 5 is very small so they are discarded considering only 4, 6 & 8



In [80]:

    
df1['cylinders'] = df1["cylinders"].astype(float)
df1 = df1.drop( df[(df.cylinders == 3.0)].index )
df1 = df1.drop( df[(df.cylinders == 5.0)].index )



In [81]:

    
df1['cylinders'] = df1['cylinders'].astype(str)
dffinal1 = df1[['cylinders','modelyear','origin','mpg','displacement','horsepower','weight','acceleration']]
dffinal1.head()









    Out[81]:







  
    
      
      cylinders
      modelyear
      origin
      mpg
      displacement
      horsepower
      weight
      acceleration
    
  
  
    
      0
      8.0
      70
      1
      18.0
      307.0
      130.0
      3504.0
      12.0
    
    
      1
      8.0
      70
      1
      15.0
      350.0
      165.0
      3693.0
      11.5
    
    
      2
      8.0
      70
      1
      18.0
      318.0
      150.0
      3436.0
      11.0
    
    
      3
      8.0
      70
      1
      16.0
      304.0
      150.0
      3433.0
      12.0
    
    
      4
      8.0
      70
      1
      17.0
      302.0
      140.0
      3449.0
      10.5



In [82]:

    
dffinal1['cylinders'].unique()









    Out[82]:





array(['8.0', '4.0', '6.0'], dtype=object)



In [83]:

    
for col in dffinal1.loc[:,'mpg':'acceleration'].columns:
    print(col)
    print(stats.ttest_ind(
        dffinal1[dffinal1['cylinders'] == '4.0'][col],
        dffinal1[dffinal1['cylinders'] == '6.0'][col]
    ))









    



mpg
Ttest_indResult(statistic=13.704896898546142, pvalue=4.6176516348996581e-33)
displacement
Ttest_indResult(statistic=-33.112179392494404, pvalue=8.0188842366671865e-99)
horsepower
Ttest_indResult(statistic=-12.290984157717734, pvalue=4.6614791369195407e-28)
weight
Ttest_indResult(statistic=-20.200932585820645, pvalue=1.3281556565135272e-56)
acceleration
Ttest_indResult(statistic=1.0971236695009552, pvalue=0.27352996514284744)



In [84]:

    
for col in dffinal1.loc[:,'mpg':'acceleration'].columns:
    print(col)
    print(stats.ttest_ind(
        dffinal1[dffinal1['cylinders'] == '4.0'][col],
        dffinal1[dffinal1['cylinders'] == '8.0'][col]
    ))









    



mpg
Ttest_indResult(statistic=24.103973896854541, pvalue=3.7528790110858902e-72)
displacement
Ttest_indResult(statistic=-59.960781625803044, pvalue=4.6497557842603817e-169)
horsepower
Ttest_indResult(statistic=-32.380959782114893, pvalue=6.4048618449507412e-100)
weight
Ttest_indResult(statistic=-39.005421190072092, pvalue=1.6157896144922786e-119)
acceleration
Ttest_indResult(statistic=12.819858346505718, pvalue=2.6761671258949308e-30)



In [85]:

    
for col in dffinal1.loc[:,'mpg':'acceleration'].columns:
    print(col)
    print(stats.ttest_ind(
        dffinal1[dffinal1['cylinders'] == '6.0'][col],
        dffinal1[dffinal1['cylinders'] == '8.0'][col]
    ))









    



mpg
Ttest_indResult(statistic=10.2452351142847, pvalue=8.9194661494413186e-20)
displacement
Ttest_indResult(statistic=-20.938533947460595, pvalue=1.4173445767138258e-50)
horsepower
Ttest_indResult(statistic=-16.568640256498949, pvalue=2.4950170362742587e-38)
weight
Ttest_indResult(statistic=-15.42376292075258, pvalue=5.4480493223937389e-35)
acceleration
Ttest_indResult(statistic=10.446470409357158, pvalue=2.3632766141946463e-20)

The difference for all variables for each cylinders value is significant (except for acceleration when comparing 4 & 6)



In [86]:

    
plt.figure(figsize=(20,5))
ax = sns.countplot(x="modelyear", hue='cylinders', data=dffinal1, palette="Set3")
plt.show()

# Table of counts
counttable = pd.crosstab(dffinal1['modelyear'], dffinal1['cylinders'])
print(counttable)









    












    



cylinders  4.0  6.0  8.0
modelyear               
70           7    4   18
71          12    8    7
72          14    0   13
73          11    8   20
74          15    6    5
75          12   12    6
76          15   10    9
77          14    5    8
78          17   12    6
79          12    6   10
80          23    2    0
81          20    7    1
82          27    3    0



In [87]:

    
print(stats.chisquare(counttable, axis=None))









    



Power_divergenceResult(statistic=158.67012987012987, pvalue=1.0939969858940465e-16)

Modelyear on average is equivalent regarding the population per year. There are differences regarding the cylinders values. The group size differences are large enough to reflect differences on the population.

Create 10 new features



In [88]:

    
#Feature 1: Standard number of cylinders vs high end number of cylinders

features = pd.get_dummies(dffinal1['cylinders'])
features['High_end'] = np.where((dffinal1['cylinders'].isin(['6.0', '8.0'])), 1, 0)
#print(pd.crosstab(features['High_end'], dffinal1['cylinders']))



In [89]:

    
#Feature 2: # Cars from the 70s and cars from the 80s.

features = pd.get_dummies(dffinal1['modelyear'])
features['decade'] = np.where((dffinal1['modelyear'].isin(range(70,80))), 1, 0)
#print(pd.crosstab(features['decade'], dffinal1['modelyear']))



In [90]:

    
# Feature 3: National cars vs imported cars

features = pd.get_dummies(dffinal1['origin'])
features['national'] = np.where((dffinal1['origin'].isin(['1'])), 1, 0)
#print(pd.crosstab(features['national'], dffinal1['origin']))



In [91]:

    
# Feature 4: Nacceleration: Normalized acceleration
# Making a four-panel plot.
fig = plt.figure()

fig.add_subplot(221)
plt.hist(dffinal1['acceleration'].dropna())
plt.title('Raw')

fig.add_subplot(222)
plt.hist(np.log(dffinal1['acceleration'].dropna()))
plt.title('Log')

fig.add_subplot(223)
plt.hist(np.sqrt(dffinal1['acceleration'].dropna()))
plt.title('Square root')

ax3=fig.add_subplot(224)
plt.hist(1/df['acceleration'].dropna())
plt.title('Inverse')
plt.show()


features['nacceleration'] = np.sqrt(dffinal1['acceleration'])



In [92]:

    
# Feature 5: CAR DHW. Composite of highly correlated variables

corrmat = dffinal1.corr()

# Set up the matplotlib figure.
f, ax = plt.subplots(figsize=(12, 9))

# Draw the heatmap using seaborn
sns.heatmap(corrmat, vmax=.8, square=True)
plt.show()


means = dffinal1[['displacement','horsepower','weight']].mean(axis=0)
stds = dffinal1[['displacement','horsepower','weight']].std(axis=0)
features['car_dhw'] = ((dffinal1[['displacement','horsepower','weight']] - means) / stds).mean(axis=1)

# Check how well the composite correlates with each of the individual variables.
plotdffinal1= dffinal1.loc[:, ['displacement','horsepower','weight']]
plotdffinal1['dhw'] = features['car_dhw'] 
corrmat2 = plotdffinal1.corr()

print(corrmat2)









    












    



              displacement  horsepower    weight       dhw
displacement      1.000000    0.902830  0.935296  0.978428
horsepower        0.902830    1.000000  0.868906  0.955540
weight            0.935296    0.868906  1.000000  0.966732
dhw               0.978428    0.955540  0.966732  1.000000



In [93]:

    
# Feature 6: Carperformance. Relationship between car_dhw & nacceleration
features['carperformance'] = features['car_dhw'] * features['nacceleration']

# A plot of an interaction.
# Add the 'tvtot' feature to the features data frame for plotting.
features['mpg'] = dffinal1['mpg']
sns.lmplot(
    x='carperformance',
    y='mpg',

    data=features,
    scatter=False
)
plt.show()



In [94]:

    
# Feature 7: Carperformance (squared).
sns.regplot(
    features['carperformance'],
    y=dffinal1['mpg'],
    y_jitter=.49,
    order=2,
    scatter_kws={'alpha':0.3},
    line_kws={'color':'black'},
    ci=None
)
plt.show()

features['carperformance_sq'] = features['carperformance'] * features['carperformance']



In [95]:

    
# Feature 7: standardised carperformance (squared).
means = features[['carperformance_sq']].mean(axis=0)
stds = features[['carperformance_sq']].std(axis=0)
features['standcarperformance_sq'] = ((features[['carperformance_sq']] - means) / stds).mean(axis=1)



In [96]:

    
# Feature 8: Acceleration (squared).
sns.regplot(
    dffinal1['acceleration'],
    y=dffinal1['mpg'],
    y_jitter=.49,
    order=2,
    scatter_kws={'alpha':0.3},
    line_kws={'color':'black'},
    ci=None
)
plt.show()

features['acceleration_sq'] = dffinal1['acceleration'] * dffinal1['acceleration']



In [97]:

    
# Feature 9: Dhw composite value abs.
sns.regplot(
    dffinal1['acceleration'],
    y=features['car_dhw'],
    y_jitter=.49,
    order=2,
    scatter_kws={'alpha':0.3},
    line_kws={'color':'black'},
    ci=None
)
plt.show()

features['dhw_abs'] = features['car_dhw'].abs()



In [98]:

    
# Select only numeric variables to scale.
df_num = features.select_dtypes(include=[np.number]).dropna()

# Save the column names.
names=df_num.columns

# Scale, then turn the resulting numpy array back into a data frame with the correct column names.
df_scaled = pd.DataFrame(preprocessing.scale(df_num), columns=names)

# The new features contain all the information of the old ones, but on a new scale.
plt.scatter(df_num['car_dhw'], df_scaled['car_dhw'])
plt.show()

# Lookit all those matching means and standard deviations!
print(df_scaled.describe())









    












    



                  1             2           3      national  nacceleration  \
count  3.850000e+02  3.850000e+02  385.000000  3.850000e+02   3.850000e+02   
mean   5.536697e-17  3.691131e-17    0.000000  5.536697e-17   8.858715e-16   
std    1.001301e+00  1.001301e+00    1.001301  1.001301e+00   1.001301e+00   
min   -1.322876e+00 -4.506939e-01   -0.491869 -1.322876e+00  -3.137143e+00   
25%   -1.322876e+00 -4.506939e-01   -0.491869 -1.322876e+00  -5.666576e-01   
50%    7.559289e-01 -4.506939e-01   -0.491869  7.559289e-01   2.960915e-02   
75%    7.559289e-01 -4.506939e-01   -0.491869  7.559289e-01   5.612259e-01   
max    7.559289e-01  2.218801e+00    2.033060  7.559289e-01   3.008909e+00   

          car_dhw  carperformance           mpg  carperformance_sq  \
count  385.000000    3.850000e+02  3.850000e+02       3.850000e+02   
mean     0.000000    1.845566e-17  7.382262e-17       2.214679e-16   
std      1.001301    1.001301e+00  1.001301e+00       1.001301e+00   
min     -1.403983   -1.555483e+00 -1.845657e+00      -9.779589e-01   
25%     -0.840136   -8.266631e-01 -8.235186e-01      -8.048176e-01   
50%     -0.301297   -2.686098e-01 -5.691454e-02      -2.601489e-01   
75%      0.801717    7.960013e-01  7.096895e-01       4.408574e-01   
max      2.720384    2.460569e+00  2.958395e+00       4.660061e+00   

       standcarperformance_sq  acceleration_sq       dhw_abs  
count            3.850000e+02     3.850000e+02  3.850000e+02  
mean             1.845566e-17    -2.952905e-16  1.476452e-16  
std              1.001301e+00     1.001301e+00  1.001301e+00  
min             -9.779589e-01    -2.093184e+00 -1.541734e+00  
25%             -8.048176e-01    -6.319763e-01 -8.220352e-01  
50%             -2.601489e-01    -1.000113e-01 -8.615137e-03  
75%              4.408574e-01     4.512916e-01  5.509704e-01  
max              4.660061e+00     4.138406e+00  3.461176e+00



In [109]:

    
# Normalize the data so that all variables have a mean of 0 and standard deviation
# of 1.
X = StandardScaler().fit_transform(df_scaled)
# The NumPy covariance function assumes that variables are represented by rows,
# not columns, so we transpose X.
Xt = X.T
Cx = np.cov(Xt)
print('Covariance Matrix:\n', Cx)









    



Covariance Matrix:
 [[ 1.00260417 -0.59776464 -0.65237651  1.00260417 -0.27023196  0.60341904
   0.61770242 -0.57819656  0.04366441  0.04366441 -0.25396748  0.04132951]
 [-0.59776464  1.00260417 -0.22225983 -0.59776464  0.18693165 -0.33455249
  -0.341136    0.24065758 -0.0594418  -0.0594418   0.20234152 -0.06105525]
 [-0.65237651 -0.22225983  1.00260417 -0.65237651  0.15141702 -0.4164817
  -0.42760341  0.47465638  0.00318768  0.00318768  0.11708673  0.00754974]
 [ 1.00260417 -0.59776464 -0.65237651  1.00260417 -0.27023196  0.60341904
   0.61770242 -0.57819656  0.04366441  0.04366441 -0.25396748  0.04132951]
 [-0.27023196  0.18693165  0.15141702 -0.27023196  1.00260417 -0.59937756
  -0.59181339  0.42808296 -0.27649744 -0.27649744  0.98244019 -0.40198289]
 [ 0.60341904 -0.33455249 -0.4164817   0.60341904 -0.59937756  1.00260417
   0.99797643 -0.84365253  0.45712344  0.45712344 -0.54005189  0.46048174]
 [ 0.61770242 -0.341136   -0.42760341  0.61770242 -0.59181339  0.99797643
   1.00260417 -0.85559102  0.40099996  0.40099996 -0.54562291  0.40453462]
 [-0.57819656  0.24065758  0.47465638 -0.57819656  0.42808296 -0.84365253
  -0.85559102  1.00260417 -0.18361233 -0.18361233  0.40022221 -0.17346323]
 [ 0.04366441 -0.0594418   0.00318768  0.04366441 -0.27649744  0.45712344
   0.40099996 -0.18361233  1.00260417  1.00260417 -0.2012949   0.95814797]
 [ 0.04366441 -0.0594418   0.00318768  0.04366441 -0.27649744  0.45712344
   0.40099996 -0.18361233  1.00260417  1.00260417 -0.2012949   0.95814797]
 [-0.25396748  0.20234152  0.11708673 -0.25396748  0.98244019 -0.54005189
  -0.54562291  0.40022221 -0.2012949  -0.2012949   1.00260417 -0.32683974]
 [ 0.04132951 -0.06105525  0.00754974  0.04132951 -0.40198289  0.46048174
   0.40453462 -0.17346323  0.95814797  0.95814797 -0.32683974  1.00260417]]



In [111]:

    
# Calculating eigenvalues and eigenvectors.
eig_val_cov, eig_vec_cov = np.linalg.eig(Cx)

# Inspecting the eigenvalues and eigenvectors.
for i in range(len(eig_val_cov)):
    eigvec_cov = eig_vec_cov[:, i].reshape(1, 12).T
    print('Eigenvector {}: \n{}'.format(i + 1, eigvec_cov))
    print('Eigenvalue {}: {}'.format(i + 1, eig_val_cov[i]))
    print(40 * '-')

print(
    'The percentage of total variance in the dataset explained by each',
    'component calculated by hand.\n',
    eig_val_cov / sum(eig_val_cov)
)









    



Eigenvector 1: 
[[-0.30662858]
 [ 0.17946324]
 [ 0.20268855]
 [-0.30662858]
 [ 0.29081385]
 [-0.39893229]
 [-0.39484668]
 [ 0.33353444]
 [-0.22377088]
 [-0.22377088]
 [ 0.26980338]
 [-0.23421927]]
Eigenvalue 1: 5.570779082093615
----------------------------------------
Eigenvector 2: 
[[ 0.33509103]
 [-0.15420272]
 [-0.26115178]
 [ 0.33509103]
 [ 0.08352041]
 [ 0.00800076]
 [ 0.04094317]
 [-0.15193513]
 [-0.46491707]
 [-0.46491707]
 [ 0.05500404]
 [-0.46511047]]
Eigenvalue 2: 2.8531884464049573
----------------------------------------
Eigenvector 3: 
[[ 0.23386915]
 [-0.0405149 ]
 [-0.24573814]
 [ 0.23386915]
 [ 0.55970159]
 [-0.01456539]
 [-0.03259331]
 [ 0.01234669]
 [ 0.25529005]
 [ 0.25529005]
 [ 0.6026557 ]
 [ 0.14265012]]
Eigenvalue 3: 1.4594201394417539
----------------------------------------
Eigenvector 4: 
[[-0.1383916 ]
 [ 0.77355452]
 [-0.56357171]
 [-0.1383916 ]
 [-0.03075099]
 [ 0.07142987]
 [ 0.07429374]
 [-0.17895194]
 [-0.02670961]
 [-0.02670961]
 [-0.00368846]
 [-0.0257731 ]]
Eigenvalue 4: 1.256829776139853
----------------------------------------
Eigenvector 5: 
[[-0.28011632]
 [-0.02505573]
 [ 0.36392994]
 [-0.28011632]
 [ 0.25531687]
 [ 0.34040447]
 [ 0.35827853]
 [-0.54072874]
 [-0.07080522]
 [-0.07080522]
 [ 0.26516523]
 [-0.1668669 ]]
Eigenvalue 5: 0.6932403158111243
----------------------------------------
Eigenvector 6: 
[[-0.03532624]
 [ 0.05636706]
 [-0.01040705]
 [-0.03532624]
 [ 0.022553  ]
 [ 0.44896767]
 [ 0.47049792]
 [ 0.73442505]
 [-0.06898282]
 [-0.06898282]
 [ 0.10929538]
 [-0.0996332 ]]
Eigenvalue 6: 0.1368409246938686
----------------------------------------
Eigenvector 7: 
[[-0.00354692]
 [-0.00576639]
 [ 0.00976223]
 [-0.00354692]
 [-0.02551176]
 [ 0.0522413 ]
 [ 0.00473941]
 [-0.02486686]
 [-0.39544177]
 [-0.39544177]
 [ 0.17594593]
 [ 0.80753954]]
Eigenvalue 7: 0.04337584707152005
----------------------------------------
Eigenvector 8: 
[[ 0.0021865 ]
 [-0.01594879]
 [ 0.01242935]
 [ 0.0021865 ]
 [-0.69054839]
 [ 0.18359978]
 [-0.25560663]
 [-0.03092752]
 [ 0.03850664]
 [ 0.03850664]
 [ 0.63392223]
 [-0.1337963 ]]
Eigenvalue 8: 0.01645202601765204
----------------------------------------
Eigenvector 9: 
[[-0.00103217]
 [ 0.00494827]
 [-0.00342662]
 [-0.00103217]
 [ 0.22591343]
 [ 0.69396645]
 [-0.64865157]
 [ 0.01231522]
 [-0.01622762]
 [-0.01622762]
 [-0.21424459]
 [-0.00271668]]
Eigenvalue 9: 0.0011234423256617537
----------------------------------------
Eigenvector 10: 
[[  2.05381160e-01]
 [ -1.24943396e-01]
 [ -1.32097050e-01]
 [ -3.65827088e-01]
 [  1.21868601e-14]
 [  3.50071841e-14]
 [ -3.29051164e-14]
 [  6.00410446e-16]
 [  6.28859538e-01]
 [ -6.28859538e-01]
 [ -1.17051194e-14]
 [ -3.63665045e-16]]
Eigenvalue 10: 8.271507446402408e-18
----------------------------------------
Eigenvector 11: 
[[ -7.83428975e-01]
 [ -2.78993342e-01]
 [ -2.94967149e-01]
 [  4.25159975e-01]
 [ -6.97922917e-14]
 [ -2.30970655e-13]
 [  2.15414037e-13]
 [ -5.09610064e-15]
 [ -1.42539701e-01]
 [  1.42539701e-01]
 [  6.61319907e-14]
 [  1.78466456e-15]]
Eigenvalue 11: -1.2147733617760625e-16
----------------------------------------
Eigenvector 12: 
[[  3.60621469e-01]
 [ -2.31299455e-01]
 [ -2.44542541e-01]
 [ -6.57644415e-01]
 [ -2.40989789e-15]
 [ -2.16804331e-14]
 [  2.08557435e-14]
 [ -1.01841774e-16]
 [ -4.02588802e-01]
 [  4.02588802e-01]
 [  2.61435782e-15]
 [  2.17289611e-15]]
Eigenvalue 12: -7.74424798196182e-17
----------------------------------------
The percentage of total variance in the dataset explained by each component calculated by hand.
 [  4.63025794e-01   2.37148131e-01   1.21302453e-01   1.04463774e-01
   5.76199743e-02   1.13737911e-02   3.60526521e-03   1.36744112e-03
   9.33770245e-05   6.87501918e-19  -1.00968176e-17  -6.43677754e-18]



In [112]:

    
#From the Scree plot we should use onle the first 2 components that will explain 46% and 23% each
plt.plot(eig_val_cov)
plt.show()



In [114]:

    
# Create P, which we will use to transform Cx into Cy to get Y, the
# dimensionally-reduced representation of X.
P = eig_vec_cov[:, 0]

# Transform X into Y.
Y = P.T.dot(Xt)

# Combine X and Y for plotting purposes.
data_to_plot = df_scaled[['nacceleration','car_dhw','carperformance','carperformance_sq','standcarperformance_sq','acceleration_sq','dhw_abs']]
data_to_plot['Component'] = Y
data_to_plot = pd.melt(data_to_plot, id_vars='Component')

g = sns.FacetGrid(data_to_plot, col="variable", size=4, aspect=.5)
g = g.map(
    sns.regplot,
    "Component",
    "value",
    x_jitter=.49,
    y_jitter=.49,
    fit_reg=False
)
plt.show()









    



d:\users\borja.gonzalez\appdata\local\programs\python\python36-32\lib\site-packages\ipykernel_launcher.py:10: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.



In [121]:

    
sklearn_pca = PCA(n_components=5)
Y_sklearn = sklearn_pca.fit_transform(X)

print(
    'The percentage of total variance in the dataset explained by each',
    'component from Sklearn PCA.\n',
    sklearn_pca.explained_variance_ratio_
)

# Compare the sklearn solution to ours – a perfect match.
plt.plot(Y_sklearn[:, 0], Y, 'o')
plt.title('Comparing solutions')
plt.ylabel('Sklearn Component 1')
plt.xlabel('By-hand Component 1')
plt.show()









    



The percentage of total variance in the dataset explained by each component from Sklearn PCA.
 [ 0.46302579  0.23714813  0.12130245  0.10446377  0.05761997]



In [ ]:

	mpg	cylinders	displacement	horsepower	weight	acceleration	modelyear	origin	carname
0	18.0	8	307.0	130.0	3504.0	12.0	70	1	"chevrolet chevelle malibu"
1	15.0	8	350.0	165.0	3693.0	11.5	70	1	"buick skylark 320"
2	18.0	8	318.0	150.0	3436.0	11.0	70	1	"plymouth satellite"
3	16.0	8	304.0	150.0	3433.0	12.0	70	1	"amc rebel sst"
4	17.0	8	302.0	140.0	3449.0	10.5	70	1	"ford torino"

	cylinders	3.0	4.0	5.0	6.0	8.0
acceleration	count	4.000000	199.000000	3.000000	83.000000	103.000000
	mean	13.250000	16.581910	18.633333	16.254217	12.955340
	std	0.500000	2.383185	2.369247	2.031778	2.224759
	min	12.500000	11.600000	15.900000	11.300000	8.000000
	25%	13.250000	14.800000	17.900000	15.050000	11.500000
	50%	13.500000	16.200000	19.900000	16.000000	13.000000
	75%	13.500000	18.000000	20.000000	17.600000	14.000000
	max	13.500000	24.800000	20.100000	21.000000	22.200000
displacement	count	4.000000	199.000000	3.000000	83.000000	103.000000
	mean	72.500000	109.670854	145.000000	218.361446	345.009709
	std	5.000000	21.376813	33.286634	32.427246	46.776376
	min	70.000000	68.000000	121.000000	145.000000	260.000000
	25%	70.000000	91.000000	126.000000	199.500000	305.000000
	50%	70.000000	105.000000	131.000000	231.000000	350.000000
	75%	72.500000	121.000000	157.000000	250.000000	360.000000
	max	80.000000	156.000000	183.000000	262.000000	455.000000
horsepower	count	4.000000	199.000000	3.000000	83.000000	103.000000
	mean	99.250000	78.281407	82.333333	101.506024	158.300971
	std	8.301606	14.523099	18.583146	14.310472	28.453552
	min	90.000000	46.000000	67.000000	72.000000	90.000000
	25%	95.250000	68.000000	72.000000	92.500000	140.000000
	50%	98.500000	78.000000	77.000000	100.000000	150.000000
	75%	102.500000	88.000000	90.000000	110.000000	175.000000
	max	110.000000	115.000000	103.000000	165.000000	230.000000
modelyear	count	4.000000	199.000000	3.000000	83.000000	103.000000
	mean	75.500000	77.030151	79.000000	75.951807	73.902913
	std	3.696846	3.737484	1.000000	3.264381	3.021214
	min	72.000000	70.000000	78.000000	70.000000	70.000000
	25%	72.750000	74.000000	78.500000	74.000000	72.000000
	50%	75.000000	77.000000	79.000000	76.000000	73.000000
	75%	77.750000	80.000000	79.500000	78.000000	76.000000
	max	80.000000	82.000000	80.000000	82.000000	81.000000
mpg	count	4.000000	199.000000	3.000000	83.000000	103.000000
	mean	20.550000	29.283920	27.366667	19.973494	14.963107
	std	2.564501	5.670546	8.228204	3.828809	2.836284
	min	18.000000	18.000000	20.300000	15.000000	9.000000
	25%	18.750000	25.000000	22.850000	18.000000	13.000000
	50%	20.250000	28.400000	25.400000	19.000000	14.000000
	75%	22.050000	32.950000	30.900000	21.000000	16.000000
	max	23.700000	46.600000	36.400000	38.000000	26.600000
origin	count	4.000000	199.000000	3.000000	83.000000	103.000000
	mean	3.000000	2.000000	2.000000	1.192771	1.000000
	std	0.000000	0.834847	0.000000	0.551225	0.000000
	min	3.000000	1.000000	2.000000	1.000000	1.000000
	25%	3.000000	1.000000	2.000000	1.000000	1.000000
	50%	3.000000	2.000000	2.000000	1.000000	1.000000
	75%	3.000000	3.000000	2.000000	1.000000	1.000000
	max	3.000000	3.000000	2.000000	3.000000	1.000000
weight	count	4.000000	199.000000	3.000000	83.000000	103.000000
	mean	2398.500000	2305.110553	3103.333333	3202.120482	4114.718447
	std	247.566153	342.872223	374.343870	332.383425	448.833159
	min	2124.000000	1613.000000	2830.000000	2472.000000	3086.000000
	25%	2278.500000	2047.500000	2890.000000	2945.000000	3799.000000
	50%	2375.000000	2230.000000	2950.000000	3210.000000	4140.000000
	75%	2495.000000	2562.500000	3240.000000	3431.000000	4403.500000
	max	2720.000000	3270.000000	3530.000000	3907.000000	5140.000000