Exercise 02



In [2]:

    
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib inline

1 College Data set



In [51]:

    
college_file_path = '../data/College.csv'
colleges = pd.read_csv(college_file_path, index_col=0)
colleges.describe()









    Out[51]:






  
    
      
      Apps
      Accept
      Enroll
      Top10perc
      Top25perc
      F.Undergrad
      P.Undergrad
      Outstate
      Room.Board
      Books
      Personal
      PhD
      Terminal
      S.F.Ratio
      perc.alumni
      Expend
      Grad.Rate
    
  
  
    
      count
      777.000000
      777.000000
      777.000000
      777.000000
      777.000000
      777.000000
      777.000000
      777.000000
      777.000000
      777.000000
      777.000000
      777.000000
      777.000000
      777.000000
      777.000000
      777.000000
      777.00000
    
    
      mean
      3001.638353
      2018.804376
      779.972973
      27.558559
      55.796654
      3699.907336
      855.298584
      10440.669241
      4357.526384
      549.380952
      1340.642214
      72.660232
      79.702703
      14.089704
      22.743887
      9660.171171
      65.46332
    
    
      std
      3870.201484
      2451.113971
      929.176190
      17.640364
      19.804778
      4850.420531
      1522.431887
      4023.016484
      1096.696416
      165.105360
      677.071454
      16.328155
      14.722359
      3.958349
      12.391801
      5221.768440
      17.17771
    
    
      min
      81.000000
      72.000000
      35.000000
      1.000000
      9.000000
      139.000000
      1.000000
      2340.000000
      1780.000000
      96.000000
      250.000000
      8.000000
      24.000000
      2.500000
      0.000000
      3186.000000
      10.00000
    
    
      25%
      776.000000
      604.000000
      242.000000
      15.000000
      41.000000
      992.000000
      95.000000
      7320.000000
      3597.000000
      470.000000
      850.000000
      62.000000
      71.000000
      11.500000
      13.000000
      6751.000000
      53.00000
    
    
      50%
      1558.000000
      1110.000000
      434.000000
      23.000000
      54.000000
      1707.000000
      353.000000
      9990.000000
      4200.000000
      500.000000
      1200.000000
      75.000000
      82.000000
      13.600000
      21.000000
      8377.000000
      65.00000
    
    
      75%
      3624.000000
      2424.000000
      902.000000
      35.000000
      69.000000
      4005.000000
      967.000000
      12925.000000
      5050.000000
      600.000000
      1700.000000
      85.000000
      92.000000
      16.500000
      31.000000
      10830.000000
      78.00000
    
    
      max
      48094.000000
      26330.000000
      6392.000000
      96.000000
      100.000000
      31643.000000
      21836.000000
      21700.000000
      8124.000000
      2340.000000
      6800.000000
      103.000000
      100.000000
      39.800000
      64.000000
      56233.000000
      118.00000



In [52]:

    
from pandas.tools.plotting import scatter_matrix
fig, ax = plt.subplots(figsize=(20, 20))
scatter_matrix(colleges[['Apps','Accept','Enroll','Top10perc','Top25perc',
                         'F.Undergrad','P.Undergrad','Outstate','Room.Board','Books']], 
               alpha=0.2,diagonal='kde', ax=ax);









    



/Users/gaufung/anaconda/lib/python3.6/site-packages/IPython/core/interactiveshell.py:2881: UserWarning: To output multiple subplots, the figure containing the passed axes is being cleared
  exec(code_obj, self.user_global_ns, self.user_ns)



In [53]:

    
df_Outstate_Private = colleges[['Outstate','Private']]
df_Outstate_Private.boxplot(by='Private')









    Out[53]:





<matplotlib.axes._subplots.AxesSubplot at 0x11aa9d518>



In [54]:

    
colleges['Elite'] = colleges['Top10perc'] / colleges['Enroll']
indices_yes = colleges['Elite']>0.5
indices_no = colleges['Elite']<=0.5
colleges['Elite'][indices_yes]='Yes'
colleges['Elite'][indices_no]='No'









    



/Users/gaufung/anaconda/lib/python3.6/site-packages/ipykernel/__main__.py:4: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/Users/gaufung/anaconda/lib/python3.6/site-packages/ipykernel/__main__.py:5: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



In [55]:

    
elite_groups = colleges.groupby('Elite')
print('non-elite universities count is :',len(elite_groups.groups['No']))
print('elite univeristies count is: ', len(elite_groups.groups['Yes']))









    



non-elite universities count is : 775
elite univeristies count is:  2



In [56]:

    
df_Outstate_Elite = colleges[['Outstate', 'Elite']]
df_Outstate_Elite.boxplot(by='Elite')









    Out[56]:





<matplotlib.axes._subplots.AxesSubplot at 0x120b05978>



In [57]:

    
df_hist = colleges[['Apps','Accept','Enroll']]
df_hist.plot(kind='hist', alpha=0.6)









    Out[57]:





<matplotlib.axes._subplots.AxesSubplot at 0x11cc134a8>

2 Auto Data set



In [4]:

    
auto_file_path = '../data/Auto'
autos = pd.read_table(auto_file_path,sep='\s+')
autos.head()









    Out[4]:






  
    
      
      mpg
      cylinders
      displacement
      horsepower
      weight
      acceleration
      year
      origin
      name
    
  
  
    
      0
      18.0
      8
      307.0
      130.0
      3504.0
      12.0
      70
      1
      chevrolet chevelle malibu
    
    
      1
      15.0
      8
      350.0
      165.0
      3693.0
      11.5
      70
      1
      buick skylark 320
    
    
      2
      18.0
      8
      318.0
      150.0
      3436.0
      11.0
      70
      1
      plymouth satellite
    
    
      3
      16.0
      8
      304.0
      150.0
      3433.0
      12.0
      70
      1
      amc rebel sst
    
    
      4
      17.0
      8
      302.0
      140.0
      3449.0
      10.5
      70
      1
      ford torino



In [71]:

    
autos.describe()









    Out[71]:






  
    
      
      mpg
      cylinders
      displacement
      weight
      acceleration
      year
      origin
    
  
  
    
      count
      397.000000
      397.000000
      397.000000
      397.000000
      397.000000
      397.000000
      397.000000
    
    
      mean
      23.515869
      5.458438
      193.532746
      2970.261965
      15.555668
      75.994962
      1.574307
    
    
      std
      7.825804
      1.701577
      104.379583
      847.904119
      2.749995
      3.690005
      0.802549
    
    
      min
      9.000000
      3.000000
      68.000000
      1613.000000
      8.000000
      70.000000
      1.000000
    
    
      25%
      17.500000
      4.000000
      104.000000
      2223.000000
      13.800000
      73.000000
      1.000000
    
    
      50%
      23.000000
      4.000000
      146.000000
      2800.000000
      15.500000
      76.000000
      1.000000
    
    
      75%
      29.000000
      8.000000
      262.000000
      3609.000000
      17.100000
      79.000000
      2.000000
    
    
      max
      46.600000
      8.000000
      455.000000
      5140.000000
      24.800000
      82.000000
      3.000000

Refinement the Auto data



In [99]:

    
autos=autos.replace('?',np.NAN).dropna()
autos['horsepower']=autos['horsepower'].astype('float')
autos.describe()









    Out[99]:






  
    
      
      mpg
      cylinders
      displacement
      horsepower
      weight
      acceleration
      year
      origin
    
  
  
    
      count
      392.000000
      392.000000
      392.000000
      392.000000
      392.000000
      392.000000
      392.000000
      392.000000
    
    
      mean
      23.445918
      5.471939
      194.411990
      104.469388
      2977.584184
      15.541327
      75.979592
      1.576531
    
    
      std
      7.805007
      1.705783
      104.644004
      38.491160
      849.402560
      2.758864
      3.683737
      0.805518
    
    
      min
      9.000000
      3.000000
      68.000000
      46.000000
      1613.000000
      8.000000
      70.000000
      1.000000
    
    
      25%
      17.000000
      4.000000
      105.000000
      75.000000
      2225.250000
      13.775000
      73.000000
      1.000000
    
    
      50%
      22.750000
      4.000000
      151.000000
      93.500000
      2803.500000
      15.500000
      76.000000
      1.000000
    
    
      75%
      29.000000
      8.000000
      275.750000
      126.000000
      3614.750000
      17.025000
      79.000000
      2.000000
    
    
      max
      46.600000
      8.000000
      455.000000
      230.000000
      5140.000000
      24.800000
      82.000000
      3.000000



In [100]:

    
fig, ax = plt.subplots(figsize=(20, 20))
scatter_matrix(autos[['mpg','cylinders','displacement','horsepower','weight','acceleration']],alpha=0.5, diagonal='kde',ax=ax);









    



/Users/gaufung/anaconda/lib/python3.6/site-packages/IPython/core/interactiveshell.py:2881: UserWarning: To output multiple subplots, the figure containing the passed axes is being cleared
  exec(code_obj, self.user_global_ns, self.user_ns)

3 Boston Data set



In [3]:

    
boston_file_path = '../data/Boston.csv'
bostons = pd.read_csv(boston_file_path, index_col=0)
bostons.head()



In [102]:

    
fig, ax = plt.subplots(figsize=(20, 20))
scatter_matrix(bostons, alpha=0.5, diagonal='kde',ax=ax);









    



/Users/gaufung/anaconda/lib/python3.6/site-packages/IPython/core/interactiveshell.py:2881: UserWarning: To output multiple subplots, the figure containing the passed axes is being cleared
  exec(code_obj, self.user_global_ns, self.user_ns)

From above scatter matrix, we choose dis,black,lstat,and medv variables to compare the crim variables.



In [103]:

    
boston_sub = bostons[['crim','dis','black','lstat','medv']]
fig, ax = plt.subplots(figsize=(20, 20))
scatter_matrix(boston_sub, alpha=0.5, diagonal='kde',ax=ax);









    



/Users/gaufung/anaconda/lib/python3.6/site-packages/IPython/core/interactiveshell.py:2881: UserWarning: To output multiple subplots, the figure containing the passed axes is being cleared
  exec(code_obj, self.user_global_ns, self.user_ns)



In [104]:

    
bostons.describe()

	crim	zn	indus	nox	rm	age	dis	rad	tax	ptratio	black	lstat	medv
1	0.00632	18.0	2.31	0.538	6.575	65.2	4.0900	1	296	15.3	396.90	4.98	24.0
2	0.02731	0.0	7.07	0.469	6.421	78.9	4.9671	2	242	17.8	396.90	9.14	21.6
3	0.02729	0.0	7.07	0.469	7.185	61.1	4.9671	2	242	17.8	392.83	4.03	34.7
4	0.03237	0.0	2.18	0.458	6.998	45.8	6.0622	3	222	18.7	394.63	2.94	33.4
5	0.06905	0.0	2.18	0.458	7.147	54.2	6.0622	3	222	18.7	396.90	5.33	36.2

	crim	zn	indus	chas	nox	rm	age	dis	rad	tax	ptratio	black	lstat	medv
count	506.000000	506.000000	506.000000	506.000000	506.000000	506.000000	506.000000	506.000000	506.000000	506.000000	506.000000	506.000000	506.000000	506.000000
mean	3.613524	11.363636	11.136779	0.069170	0.554695	6.284634	68.574901	3.795043	9.549407	408.237154	18.455534	356.674032	12.653063	22.532806
std	8.601545	23.322453	6.860353	0.253994	0.115878	0.702617	28.148861	2.105710	8.707259	168.537116	2.164946	91.294864	7.141062	9.197104
min	0.006320	0.000000	0.460000	0.000000	0.385000	3.561000	2.900000	1.129600	1.000000	187.000000	12.600000	0.320000	1.730000	5.000000
25%	0.082045	0.000000	5.190000	0.000000	0.449000	5.885500	45.025000	2.100175	4.000000	279.000000	17.400000	375.377500	6.950000	17.025000
50%	0.256510	0.000000	9.690000	0.000000	0.538000	6.208500	77.500000	3.207450	5.000000	330.000000	19.050000	391.440000	11.360000	21.200000
75%	3.677082	12.500000	18.100000	0.000000	0.624000	6.623500	94.075000	5.188425	24.000000	666.000000	20.200000	396.225000	16.955000	25.000000
max	88.976200	100.000000	27.740000	1.000000	0.871000	8.780000	100.000000	12.126500	24.000000	711.000000	22.000000	396.900000	37.970000	50.000000

	Apps	Accept	Enroll	Top10perc	Top25perc	F.Undergrad	P.Undergrad	Outstate	Room.Board	Books	Personal	PhD	Terminal	S.F.Ratio	perc.alumni	Expend	Grad.Rate
count	777.000000	777.000000	777.000000	777.000000	777.000000	777.000000	777.000000	777.000000	777.000000	777.000000	777.000000	777.000000	777.000000	777.000000	777.000000	777.000000	777.00000
mean	3001.638353	2018.804376	779.972973	27.558559	55.796654	3699.907336	855.298584	10440.669241	4357.526384	549.380952	1340.642214	72.660232	79.702703	14.089704	22.743887	9660.171171	65.46332
std	3870.201484	2451.113971	929.176190	17.640364	19.804778	4850.420531	1522.431887	4023.016484	1096.696416	165.105360	677.071454	16.328155	14.722359	3.958349	12.391801	5221.768440	17.17771
min	81.000000	72.000000	35.000000	1.000000	9.000000	139.000000	1.000000	2340.000000	1780.000000	96.000000	250.000000	8.000000	24.000000	2.500000	0.000000	3186.000000	10.00000
25%	776.000000	604.000000	242.000000	15.000000	41.000000	992.000000	95.000000	7320.000000	3597.000000	470.000000	850.000000	62.000000	71.000000	11.500000	13.000000	6751.000000	53.00000
50%	1558.000000	1110.000000	434.000000	23.000000	54.000000	1707.000000	353.000000	9990.000000	4200.000000	500.000000	1200.000000	75.000000	82.000000	13.600000	21.000000	8377.000000	65.00000
75%	3624.000000	2424.000000	902.000000	35.000000	69.000000	4005.000000	967.000000	12925.000000	5050.000000	600.000000	1700.000000	85.000000	92.000000	16.500000	31.000000	10830.000000	78.00000
max	48094.000000	26330.000000	6392.000000	96.000000	100.000000	31643.000000	21836.000000	21700.000000	8124.000000	2340.000000	6800.000000	103.000000	100.000000	39.800000	64.000000	56233.000000	118.00000

	mpg	cylinders	displacement	horsepower	weight	acceleration	year	origin	name
0	18.0	8	307.0	130.0	3504.0	12.0	70	1	chevrolet chevelle malibu
1	15.0	8	350.0	165.0	3693.0	11.5	70	1	buick skylark 320
2	18.0	8	318.0	150.0	3436.0	11.0	70	1	plymouth satellite
3	16.0	8	304.0	150.0	3433.0	12.0	70	1	amc rebel sst
4	17.0	8	302.0	140.0	3449.0	10.5	70	1	ford torino

	mpg	cylinders	displacement	weight	acceleration	year	origin
count	397.000000	397.000000	397.000000	397.000000	397.000000	397.000000	397.000000
mean	23.515869	5.458438	193.532746	2970.261965	15.555668	75.994962	1.574307
std	7.825804	1.701577	104.379583	847.904119	2.749995	3.690005	0.802549
min	9.000000	3.000000	68.000000	1613.000000	8.000000	70.000000	1.000000
25%	17.500000	4.000000	104.000000	2223.000000	13.800000	73.000000	1.000000
50%	23.000000	4.000000	146.000000	2800.000000	15.500000	76.000000	1.000000
75%	29.000000	8.000000	262.000000	3609.000000	17.100000	79.000000	2.000000
max	46.600000	8.000000	455.000000	5140.000000	24.800000	82.000000	3.000000

	mpg	cylinders	displacement	horsepower	weight	acceleration	year	origin
count	392.000000	392.000000	392.000000	392.000000	392.000000	392.000000	392.000000	392.000000
mean	23.445918	5.471939	194.411990	104.469388	2977.584184	15.541327	75.979592	1.576531
std	7.805007	1.705783	104.644004	38.491160	849.402560	2.758864	3.683737	0.805518
min	9.000000	3.000000	68.000000	46.000000	1613.000000	8.000000	70.000000	1.000000
25%	17.000000	4.000000	105.000000	75.000000	2225.250000	13.775000	73.000000	1.000000
50%	22.750000	4.000000	151.000000	93.500000	2803.500000	15.500000	76.000000	1.000000
75%	29.000000	8.000000	275.750000	126.000000	3614.750000	17.025000	79.000000	2.000000
max	46.600000	8.000000	455.000000	230.000000	5140.000000	24.800000	82.000000	3.000000