Instructor: Jeff Lockhart
Date: 11/11/2017, 8:30 - 9:30 AM
Materials online at: github.com/jwlockhart/data_workshops
`pandas` is a package of tools for working with data. We import it `as pd` so that we can type `pd` to refer to `pandas`. Programmers often do this so that we can type less. `matplotlib` is a package for making charts and graphs, and here we're going to use the `pyplot` part of it and abbreviate that as `plt`.
%matplotlib inline
is what Jupyter Notebooks call "magic." It tells the notebook to show us the graphs in the notebook rather than saving them as files or having them pop up.
In [ ]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
In [ ]:
# Read gss.csv into a DataFrame.
gss = pd.read_csv('gss.csv')
In [ ]:
# shape is a (rows, columns) pair; index it to report each count.
print("The GSS data has", gss.shape[0], "rows and", gss.shape[1], "columns.")
In [ ]:
gss.shape
In [ ]:
# List the column names.
gss.columns
In [ ]:
# head() shows the first 5 rows by default; pass a number to see more.
gss.head()
In [ ]:
gss.head(10)
In [ ]:
# tail() shows rows from the end of the table instead.
gss.tail(3)
In [ ]:
# Square brackets with a column name select that single column.
gss['age'].head()
In [ ]:
# A list of names selects several columns at once.
gss[['age', 'education.num', 'education']].head()
In [ ]:
gss['age'].mean()
In [ ]:
# describe() reports summary statistics for the column.
gss['age'].describe()
In [ ]:
# Round the summary to two decimal places for readability.
gss['age'].describe().round(2)
In [ ]:
# Histogram of ages; bins sets how many bars are drawn.
gss['age'].hist()
In [ ]:
gss['age'].hist(bins=20)
In [ ]:
# Count how many rows have each education value.
gss['education'].value_counts()
In [ ]:
gss.head()
In [ ]:
# Replace the '?' placeholder with NaN so pandas treats it as missing.
gss = gss.replace('?', np.nan)
gss.head()
In [ ]:
# Alternative: tell read_csv up front that '?' means a missing value.
gss = pd.read_csv('gss.csv', na_values=['?'])
gss.head()
In [ ]:
# Drop rows whose occupation is missing, then compare the two shapes to
# see how many rows were removed. gss itself is left unchanged.
tmp = gss.dropna(subset=['occupation'])
print(gss.shape)
print(tmp.shape)
tmp.head()
In [ ]:
# sort_values returns a sorted copy; the result is only displayed here,
# so gss keeps its original order (checked in the next cell).
gss.sort_values(by='age').head()
In [ ]:
gss.head()
In [ ]:
# Assign the result back to keep it. Sort by age, then by education.num.
gss = gss.sort_values(by=['age', 'education.num'])
gss.head()
In [ ]:
# Filter in two steps: keep rows where race is 'White', then keep the
# rows among those where age is 18.
white_18 = gss[gss['race'] == 'White']
white_18 = white_18[white_18['age'] == 18]
white_18.head()
In [ ]:
white_18.shape
In [ ]:
# The same filter in one step: combine both conditions with &, wrapping
# each comparison in parentheses.
white_18 = gss[(gss['race'] == 'White') & (gss['age'] == 18)]
white_18.head()
In [ ]:
# New column: age minus education.num, computed for every row at once.
gss['years_out'] = gss['age'] - gss['education.num']
gss.head()
In [ ]:
gss[['age', 'education.num', 'years_out', 'education']].head(10)
In [ ]:
# Columns can also be read as attributes (gss.years_out).
gss.years_out.describe()
In [ ]:
gss.years_out.min()
In [ ]:
# Show the rows where years_out equals 4.
gss[gss.years_out == 4]
In [ ]:
# Add 4 to every education.num value using column arithmetic.
gss['new_years'] = gss['education.num'] + 4
In [ ]:
# The same calculation using apply() with a lambda run on each value.
gss['new_years2'] = gss['education.num'].apply(lambda x: x+4)
In [ ]:
def add4(x):
    """Return ``x`` plus 4.

    Written as a named function so it can be handed to ``Series.apply``,
    which calls it once for each value in a column.
    """
    return x + 4
# apply() calls add4 on each value of the education.num column.
gss['new_years3'] = gss['education.num'].apply(add4)
In [ ]:
def add4(row):
    """Return the row's ``'education.num'`` value plus 4.

    ``row`` is anything that supports ``row['education.num']``; with
    ``DataFrame.apply(..., axis=1)`` pandas passes each row in turn.
    """
    return row['education.num'] + 4
# axis=1 makes apply() pass each row (rather than each column) to add4.
gss['new_years3'] = gss.apply(add4, axis=1)
In [ ]:
gss.head()
In [ ]:
# Count how many rows have each occupation.
gss.occupation.value_counts()
In [ ]:
def is_office(job):
    """Return True if ``job`` is one of the listed office occupations.

    Only three occupations are checked; the ``# more here...`` placeholder
    marks where further ones would be added to the chain. Any other value
    (including a missing/NaN occupation) gives False.
    """
    if job == 'Prof-specialty':
        office = True
    elif job == 'Exec-managerial':
        office = True
    elif job == 'Adm-clerical':
        office = True
    # more here...
    else:
        office = False
    return office
# Label every row by running is_office on its occupation, then show the
# two columns side by side to check the result.
gss['office_job'] = gss['occupation'].apply(is_office)
gss[['occupation', 'office_job']].head(10)
In [ ]:
def is_office(job):
    """Return True if ``job`` matches any occupation in ``office_jobs``.

    Loops over the list and compares one entry at a time. Values that are
    not in the list (including a missing/NaN occupation) give False.
    """
    office_jobs = [
        'Prof-specialty',
        'Exec-managerial',
        'Adm-clerical',
        'Sales',
        'Tech-support',
    ]
    for o in office_jobs:
        if job == o:
            # Found a match; no need to check the remaining entries.
            return True
    return False
# Recompute the office_job column with the current is_office and show it
# next to occupation.
gss['office_job'] = gss['occupation'].apply(is_office)
gss[['occupation', 'office_job']].head(10)
In [ ]:
def is_office(job):
    """Return True if ``job`` is in the list of office occupations.

    Uses the ``in`` operator instead of an explicit loop. Values that are
    not in the list (including a missing/NaN occupation) give False.
    """
    office_jobs = [
        'Prof-specialty',
        'Exec-managerial',
        'Adm-clerical',
        'Sales',
        'Tech-support',
    ]
    return job in office_jobs
# Apply is_office to each occupation to fill office_job, then display
# both columns for a quick check.
gss['office_job'] = gss['occupation'].apply(is_office)
gss[['occupation', 'office_job']].head(10)
In [ ]:
# Average every numeric column within each age group. numeric_only=True
# skips text columns such as occupation, which cannot be averaged and make
# recent pandas versions raise a TypeError instead of dropping them.
by_age = gss.groupby('age').mean(numeric_only=True)
by_age.head()
In [ ]:
# Draw the hours.per.week column of by_age as a line chart.
by_age['hours.per.week'].plot.line()
In [ ]:
# Summarise hours worked per week for each age: the group mean and its
# standard error (sem), gathered into one table with columns
# 'mean' and 'error'.
by_age = gss.groupby('age')
hours = (
    by_age['hours.per.week']
    .agg(['mean', 'sem'])
    .rename(columns={'sem': 'error'})
)
hours.head()
In [ ]:
# Plot the 'mean' column as a line and use the 'error' column to size the
# error bars.
hours.plot.line(y='mean', yerr='error',
title='Average hours worked by age, with error.')
In [ ]:
# Same summary as before, but grouped by sex and age together, so the
# result is indexed by (sex, age) pairs.
by_age_sex = gss.groupby(['sex', 'age'])
hours = (
    by_age_sex['hours.per.week']
    .agg(['mean', 'sem'])
    .rename(columns={'sem': 'error'})
)
hours.head()
In [ ]:
# Move index level 0 (sex, the first groupby key) into the columns, so
# each row is one age and each sex gets its own column.
hours = hours.unstack(level=0)
hours.head()
In [ ]:
# NOTE(review): yerr is commented out here, presumably because it did not
# work with the unstacked columns; confirm before re-enabling.
hours.plot.line(y='mean', #yerr='error',
title='Average hours worked by age and sex')
In [ ]:
# Box plots of hours.per.week, with one box for each combination of sex
# and office_job.
gss.boxplot(column='hours.per.week', by=['sex', 'office_job'], figsize=(8,8))
In [ ]:
# Mean hours worked per week and its standard error for each occupation,
# as a table with columns 'mean' and 'error'.
by_job = gss.groupby(['occupation'])
hours = (
    by_job['hours.per.week']
    .agg(['mean', 'sem'])
    .rename(columns={'sem': 'error'})
)
hours
In [ ]:
# Order the rows by mean hours so the bars appear sorted, then draw a
# horizontal bar chart with the 'error' column as horizontal error bars.
hours = hours.sort_values(by='mean')
hours.plot.barh(y='mean', xerr='error')
In [ ]: