In [4]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
Figure 2.1
In [5]:
# Load the advertising table from a path relative to the working directory.
data = pd.read_csv('data/Advertising.csv')
# Bare last expression: the first rows are rendered as the cell output.
data.head()
Out[5]:
In [6]:
# Figure 2.1: 'Sales' regressed on each advertising channel, one panel per
# channel. Every panel shares the same call, so the channels and their fixed
# x-limits live in one table instead of three copy-pasted pyplot blocks.
panels = [
    ('TV', (-5, 310)),
    ('Radio', (-5, 55)),
    ('Newspaper', (-5, 110)),
]
fig, axes = plt.subplots(1, 3, figsize=(18, 6))
for ax, (channel, x_limits) in zip(axes, panels):
    # Explicit ax= keeps each plot on its own panel without relying on
    # pyplot's "current axes" state.
    sns.regplot(x=channel, y='Sales', data=data,
                scatter_kws={'color': 'red'}, ax=ax)
    ax.set_xlim(*x_limits)
# Ending on the loop (not on an expression) means only the figure is shown,
# not the tuple returned by the last x-limit call.
Out[6]:
In [7]:
# Load the college table from a path relative to the working directory.
college = pd.read_csv('data/College.csv')
# Preview the first rows as the cell output.
college.head()
Out[7]:
In [8]:
# Replace the auto-generated 'Unnamed: 0' header with 'Name'. Rebinding the
# result (rather than mutating in place) leaves `college` with the same
# contents and column labels.
college = college.rename(columns={'Unnamed: 0': 'Name'})
# Show two rows to confirm the new header.
college.head(2)
Out[8]:
In [9]:
# Summary statistics of the college table, rendered as the cell output.
college.describe()
Out[9]:
In [10]:
# Pairwise plot matrix over the columns at positions 2 through 10
# (the iloc stop index 11 is exclusive); all rows are included.
sns.pairplot(college.iloc[:, 2:11])
Out[10]:
In [11]:
# Distribution of 'Outstate' for each value of 'Private', as side-by-side boxes.
sns.boxplot(x='Private', y='Outstate', data=college)
Out[11]:
In [12]:
# Derive an 'Elite' label from 'Top10perc': above 50 -> 'Yes', at most 50 -> 'No'.
# Two masked assignments are used so a row matching neither comparison is
# left untouched.
top10perc = college['Top10perc']
college.loc[top10perc > 50, 'Elite'] = 'Yes'
college.loc[top10perc <= 50, 'Elite'] = 'No'
# Print the label counts (print, because the plot below is the cell's last
# expression), then compare 'Outstate' between the two labels.
print(college['Elite'].value_counts())
sns.boxplot(x='Elite', y='Outstate', data=college)
Out[12]:
In [13]:
# Histograms of 'Apps' (top row) and 'Outstate' (bottom row), each drawn with
# a coarse (20) and a fine (100) bin count.
#
# These were drawn with sns.distplot(..., kde=False), which seaborn has
# deprecated since v0.11 and scheduled for removal. With the KDE switched off
# distplot is only a count histogram, so Matplotlib's Axes.hist draws the same
# thing on any seaborn version.
fig, axes = plt.subplots(2, 2, figsize=(10, 10))
for row_axes, column in zip(axes, ('Apps', 'Outstate')):
    # Drop missing values first, as distplot did before binning.
    values = college[column].dropna()
    for ax, n_bins in zip(row_axes, (20, 100)):
        # alpha=0.4 mirrors distplot's default histogram transparency.
        ax.hist(values, bins=n_bins, alpha=0.4)
        # distplot labelled the x axis with the series name; keep that.
        ax.set_xlabel(column)
Out[13]:
In [14]:
# Load the auto table from a path relative to the working directory.
# NOTE(review): some distributions of this dataset encode missing horsepower
# as '?'. If this CSV does, that column is parsed as text and is left out of
# numeric summaries; passing na_values='?' would fix that — confirm against the file.
auto = pd.read_csv('data/Auto.csv')
# Preview the first rows as the cell output.
auto.head()
Out[14]:
In [15]:
# Summary statistics of the auto table, rendered as the cell output.
auto.describe()
Out[15]:
In [17]:
# Build the table with the 10th through 85th observations removed, then
# summarise what remains.
#
# Observation numbers are 1-based, so the rows to drop sit at 0-based
# positions 9..84: keep positions 0..8 and 85 onwards. The head slice used to
# stop at 10, which kept the 10th observation by mistake.
# NOTE(review): the 10th-through-85th range is inferred from the 85: tail
# slice and the usual wording of this exercise — confirm the intended range.
ss1 = auto.iloc[:9]    # observations 1-9
ss2 = auto.iloc[85:]   # observations 86 onwards
subset = pd.concat([ss1, ss2])
subset.describe()
Out[17]:
In [18]:
# Pairwise plot matrix over the first eight columns (positions 0 through 7);
# all rows are included.
sns.pairplot(auto.iloc[:, :8])
Out[18]:
In [22]:
# Two views of 'mpg' side by side on explicitly created axes:
# left  - box plot of mpg for each 'cylinders' value
# right - regression of mpg on 'weight', with red scatter points
fig, (ax_box, ax_reg) = plt.subplots(1, 2, figsize=(12, 6))
sns.boxplot(x='cylinders', y='mpg', data=auto, ax=ax_box)
sns.regplot(x='weight', y='mpg', data=auto,
            scatter_kws={'color': 'red'}, ax=ax_reg)
Out[22]:
In [24]:
# Load the Boston table from a path relative to the working directory.
boston = pd.read_csv('data/Boston.csv')
# Summary statistics of the loaded table, rendered as the cell output.
boston.describe()
Out[24]:
In [25]:
# Pairwise plot matrix over every observation.
#
# The previous `boston[1:]` was a row slice: it dropped the first observation
# and kept every column. The other pair plots in this notebook trim columns
# with .iloc[:, ...], so the slice was presumably meant to skip a leading
# index column. Any pandas-generated 'Unnamed...' column is therefore excluded
# by name, which is a no-op when the CSV has none.
# NOTE(review): confirm that no row was meant to be excluded here.
is_unnamed = boston.columns.str.startswith('Unnamed')
sns.pairplot(boston.loc[:, ~is_unnamed])
Out[25]:
In [27]:
# Per the original author's note, chas is 1 when the tract is next to the
# Charles River and 0 otherwise, so the column sum is the number of tracts
# next to the river.
boston.chas.sum()
Out[27]:
In [28]:
# Median of the ptratio column across all rows.
boston.ptratio.median()
Out[28]:
In [30]:
# Row(s) whose medv equals the column minimum. The boolean mask keeps every
# row tied for the minimum, not just the first.
boston[boston.medv == boston.medv.min()]
Out[30]:
In [31]:
# Summary statistics for the rows with rm of at least 7; the 'count' row of
# the output gives how many rows matched.
# NOTE(review): the comparison is inclusive. If the question being answered is
# "more than seven", this should be a strict > — confirm.
boston[boston.rm >= 7].describe()
Out[31]:
In [32]:
# Summary statistics for the rows with rm of at least 8; the 'count' row of
# the output gives how many rows matched.
# NOTE(review): the comparison is inclusive. If the question being answered is
# "more than eight", this should be a strict > — confirm.
boston[boston.rm >= 8].describe()
Out[32]:
In [ ]: