In [ ]:
"""
----------------------------------------------------------------------
Filename : 03_regression_in_pandas_sm.py
Date : 12th Dec, 2013
Author : Jaidev Deshpande
Purpose : Introduction to linear regression through Pandas and Statsmodels
Libraries: NumPy, Pandas, Matplotlib
----------------------------------------------------------------------
"""
In [ ]:
# standard library imports
from os.path import join
# system library imports
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
In [ ]:
# Files to load during this part.
longevity = join('data','longevity.csv')
heights_weights = join('data','01_heights_weights_genders.csv')
pageviews = join('data','top_1000_sites.tsv')
In [ ]:
# DataFrames of the above datasets
longevity = pd.read_csv(longevity)
hwg = pd.read_csv(heights_weights)
# Columns to consider for the pageviews dataset
pageview_columns = ['Rank','UniqueVisitors','PageViews','HasAdvertising',
'InEnglish']
pageviews = pd.read_csv(pageviews, sep='\t', usecols=pageview_columns)
In [ ]:
longevity.head()
In [ ]:
def plot_kde_smoking(df):
"""
Draw the KDE plot of the longevity dataset
"""
smokers = df[df['Smokes']==1]['AgeAtDeath']
non_smokers = df[df['Smokes']==0]['AgeAtDeath']
plt.figure()
fig, axes = plt.subplots(nrows=2, ncols=1)
smokers.plot(ax=axes[0],kind='kde',figsize=(20,10))
axes[0].set_xbound(30,110)
axes[0].set_title('Smokers')
axes[0].set_xlabel('Age')
non_smokers.plot(ax=axes[1],kind='kde',sharex=axes[0], figsize=(20,10))
axes[1].set_xbound(30,110)
axes[1].set_title('Non-Smokers')
axes[1].set_xlabel('Age')
plt.show()
In [ ]:
plot_kde_smoking(longevity)
In [ ]:
def mean_squared_error(df,show_plot=True):
"""
Get the mean squared error obtained when fitting a straight line
through the longevity data.
"""
age_at_death = df['AgeAtDeath']
unique_ages = age_at_death.unique()
unique_ages.sort()
agediff = np.zeros((age_at_death.shape[0],unique_ages.shape[0]),
dtype=float)
for i in range(agediff.shape[1]):
agediff[:,i] = (age_at_death - unique_ages[i])**2
mse_age = agediff.mean(0)
if show_plot:
plt.figure(figsize=(20,10))
plt.plot(unique_ages, mse_age,'r.', markersize=12)
plt.plot(unique_ages, mse_age, 'b--')
plt.grid()
plt.xlabel('Estimate of Age')
plt.ylabel('Mean Squared Error')
plt.show()
return mse_age
def root_mean_squared_error(df,show_plot=False):
return np.sqrt(mean_squared_error(df,show_plot=show_plot))
In [ ]:
mse = mean_squared_error(longevity)
print(mse.min())
rmse = root_mean_squared_error(longevity)
print(rmse.min())
In [ ]:
def plot_ols_fitted(df, xcol, ycol):
"""
Fit an OLS model with the data contained in `df`,
where `xcol` and `ycol` specify the columns to consider
as the `x` and `y` axes respectively.
"""
df_ols = pd.ols(x=df[xcol],y=df[ycol])
df['estimate'] = df_ols.y_fitted
fig = plt.figure(figsize=(20,10))
axes = fig.add_subplot(111)
pd.tools.plotting.scatter_plot(df, xcol, ycol, ax=axes)
df.plot(xcol,'estimate',c='g',ax=axes,lw=5)
plt.show()
return df_ols
In [ ]:
plot_ols_fitted(hwg, 'Height', 'Weight')
In [ ]:
def compare_sites_by_advertising(df):
"""
Prints some vital statistics of the pageviews data.
"""
advertised = df[df['HasAdvertising']=='Yes']
nonadvert = df[df['HasAdvertising']=='No']
print("Sites with advertising:")
print("=" * 80)
print("Highest Rank" + 2*"\t", advertised['Rank'].min())
print("Mean PageViews" + 2*"\t", advertised['PageViews'].mean())
print("Mean UniqueVisitors" + "\t", advertised['UniqueVisitors'].mean())
print("\n")
print("Sites w/o advertising:")
print("=" * 80)
print("Highest Rank" + 2*"\t", nonadvert['Rank'].min())
print("Mean PageViews" + 2*"\t", nonadvert['PageViews'].mean())
print("Mean UniqueVisitors" + "\t", nonadvert['UniqueVisitors'].mean())
In [ ]:
compare_sites_by_advertising(pageviews)
In [ ]:
pageviews.columns
In [ ]:
pageviews['UniqueVisitors'] = pageviews['UniqueVisitors'].apply(np.log10)
pageviews['PageViews'] = pageviews['PageViews'].apply(np.log10)
In [ ]:
fig = plt.figure(figsize=(20,10))
axes = fig.add_subplot(111)
pd.tools.plotting.scatter_plot(pageviews, 'UniqueVisitors', 'PageViews', ax=axes, grid=True)
pols = pd.ols(x=pageviews['UniqueVisitors'],y=pageviews['PageViews'])
pageviews['estimate'] = pols.y_fitted
pageviews.plot('UniqueVisitors','estimate',c='g',lw=5, ax=axes)
plt.show()