Exercise 02
In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib inline
In [51]:
# Load the College data set; the first CSV column becomes the row index.
college_file_path = '../data/College.csv'
colleges = pd.read_csv(
    filepath_or_buffer=college_file_path,
    index_col=0,
)
# Summary statistics of the numeric columns, shown as the cell output.
colleges.describe()
Out[51]:
In [52]:
# `pandas.tools.plotting` was removed from pandas; `scatter_matrix` is provided
# by `pandas.plotting`. The name is reused by the later scatter-matrix cells.
from pandas.plotting import scatter_matrix

# Pairwise scatter plots (KDE on the diagonal) for the first ten quantitative
# columns of the College data.
college_scatter_columns = [
    'Apps', 'Accept', 'Enroll', 'Top10perc', 'Top25perc',
    'F.Undergrad', 'P.Undergrad', 'Outstate', 'Room.Board', 'Books',
]
fig, ax = plt.subplots(figsize=(20, 20))
scatter_matrix(colleges[college_scatter_columns],
               alpha=0.2, diagonal='kde', ax=ax);
In [53]:
# Box plots of out-of-state tuition, one box per value of `Private`.
outstate_private_columns = ['Outstate', 'Private']
df_Outstate_Private = colleges.loc[:, outstate_private_columns]
df_Outstate_Private.boxplot(by='Private')
Out[53]:
In [54]:
# Label each college 'Yes'/'No' depending on whether Top10perc / Enroll
# exceeds 0.5, and store the label in a new `Elite` column.
#
# The labels are built in one vectorized step and assigned once. The previous
# chained assignment (`colleges['Elite'][mask] = ...`) triggers
# SettingWithCopyWarning, does nothing under pandas Copy-on-Write, and writes
# strings into a float column.
#
# NOTE(review): `Top10perc` appears to already be a percentage; confirm that
# dividing it by `Enroll` (rather than testing `Top10perc > 50`) is intended.
elite_ratio = colleges['Top10perc'] / colleges['Enroll']
elite_labels = pd.Series(
    np.where(elite_ratio > 0.5, 'Yes', 'No'),
    index=colleges.index,
)
# Rows whose ratio is missing stay missing instead of being labelled 'No'.
colleges['Elite'] = elite_labels.where(elite_ratio.notna())
In [55]:
# Count how many colleges fall into each `Elite` category.
elite_groups = colleges.groupby('Elite')
elite_group_sizes = elite_groups.size()
print('non-elite universities count is :', elite_group_sizes['No'])
print('elite univeristies count is: ', elite_group_sizes['Yes'])
In [56]:
# Box plots of out-of-state tuition, one box per value of `Elite`.
outstate_elite_columns = ['Outstate', 'Elite']
df_Outstate_Elite = colleges.loc[:, outstate_elite_columns]
df_Outstate_Elite.boxplot(by='Elite')
Out[56]:
In [57]:
# Overlaid, semi-transparent histograms of applications, acceptances and
# enrolments.
hist_columns = ['Apps', 'Accept', 'Enroll']
df_hist = colleges[hist_columns]
df_hist.plot.hist(alpha=0.6)
Out[57]:
In [4]:
# Load the whitespace-delimited Auto data set and preview the first rows.
auto_file_path = '../data/Auto'
# Raw string: '\s' in a normal string literal is an invalid escape sequence
# and raises a SyntaxWarning on recent Python versions. The regex is unchanged.
autos = pd.read_table(auto_file_path, sep=r'\s+')
autos.head()
Out[4]:
In [71]:
# Summary statistics for the Auto table as loaded, before any cleaning.
# NOTE(review): a later cell replaces '?' entries and casts `horsepower` to
# float, so `horsepower` is presumably non-numeric here and missing from this
# summary — confirm against the rendered output.
autos.describe()
Out[71]:
Refining the Auto data
In [99]:
# Treat '?' placeholders as missing values, drop every row that contains one,
# and convert `horsepower` to float so it is handled as a numeric column.
# `np.nan` is used because the `np.NAN` alias was removed in NumPy 2.0.
autos = (
    autos
    .replace('?', np.nan)
    .dropna()
    .astype({'horsepower': 'float'})
)
autos.describe()
Out[99]:
In [100]:
# Pairwise scatter plots (KDE on the diagonal) of the quantitative Auto columns.
auto_scatter_columns = [
    'mpg',
    'cylinders',
    'displacement',
    'horsepower',
    'weight',
    'acceleration',
]
fig, ax = plt.subplots(figsize=(20, 20))
scatter_matrix(
    autos[auto_scatter_columns],
    alpha=0.5,
    diagonal='kde',
    ax=ax,
);
In [3]:
# Load the Boston data set; the first CSV column becomes the row index.
boston_file_path = '../data/Boston.csv'
bostons = pd.read_csv(
    filepath_or_buffer=boston_file_path,
    index_col=0,
)
# Preview the first rows as the cell output.
bostons.head()
Out[3]:
In [102]:
# Pairwise scatter plots (KDE on the diagonal) across every Boston column.
fig, ax = plt.subplots(figsize=(20, 20))
scatter_matrix(
    frame=bostons,
    alpha=0.5,
    diagonal='kde',
    ax=ax,
);
From the scatter matrix above, we choose the `dis`, `black`, `lstat`, and `medv` variables to compare with the `crim` variable.
In [103]:
# Scatter matrix restricted to `crim` and the four variables compared with it.
crime_comparison_columns = ['crim', 'dis', 'black', 'lstat', 'medv']
boston_sub = bostons[crime_comparison_columns]
fig, ax = plt.subplots(figsize=(20, 20))
scatter_matrix(
    boston_sub,
    alpha=0.5,
    diagonal='kde',
    ax=ax,
);
In [104]:
# Summary statistics of the Boston table, shown as the cell output.
bostons.describe()
Out[104]: