In [1]:
from __future__ import division
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn
import rpy2
from IPython.display import display, Image, YouTubeVideo
%matplotlib inline
Cost -- compare capabilities between software you already use and open source here
Learn more about open science at Amy Boyle's talk Thursday afternoon!
In [3]:
# Quick look at the raw CSV before the real load further down.
# NOTE(review): the original call was cut off after the opening quote; the path
# is assumed to be the same file that is loaded later in this notebook -- confirm.
pd.read_csv('12zpallagi.csv').head()
Out[3]:
In [12]:
?pd.read_csv
In [14]:
# Open IPython's built-in quick reference card.
%quickref
In [15]:
%load_ext rmagic
In [17]:
# Run R inline: build a vector holding 0..10 plus the value 50 ...
%R x <- c(0:10, 50)
# ... store its plain mean in xm ...
%R xm <- mean(x)
# ... and return the plain mean next to the mean computed with trim = 0.10.
%R c(xm, mean(x, trim = 0.10))
Out[17]:
In [8]:
# Embed the YouTube video with this id in the notebook output.
YouTubeVideo(id="L4Hbv4ugUWk")
Out[8]:
Source: IRS.gov
In [22]:
?pd.read_csv()
In [18]:
# Load the CSV into a DataFrame.
# zipcode is forced to be parsed as text instead of an integer.
AGI = pd.read_csv(
    '12zpallagi.csv',
    dtype=dict(zipcode=str),
)
In [19]:
# Print a summary (columns, dtypes, non-null counts) of the loaded frame.
AGI.info()
In [20]:
# Columns can be picked out by label or by position; labels are used here.
AGI_column_subset = AGI[
    ['STATE', 'AGI_STUB', 'zipcode', 'N1', 'A00100']
]
In [21]:
# Print a summary of the five-column subset.
AGI_column_subset.info()
In [31]:
# Select rows by index label with .loc (.ix has been removed from pandas).
# Note that a label slice is inclusive on BOTH ends, so this returns the rows
# labelled 6 through 11.
AGI_row_subset = AGI.loc[6:11]
In [32]:
# Display the selected rows.
AGI_row_subset
Out[32]:
In [34]:
# Give the coded columns readable names. The result is rebound (rather than
# using inplace=True) so a frame that was sliced out of AGI is never mutated.
AGI_column_subset = AGI_column_subset.rename(
    columns={'N1': 'population', 'A00100': 'amount'}
)
In [35]:
# Preview the renamed subset; show only the first rows instead of the whole frame.
AGI_column_subset.head()
Out[35]:
In [63]:
# Group by zipcode and sum the numeric columns, then turn zipcode back into a
# regular column. numeric_only=True keeps the string STATE column out of the
# sum: recent pandas versions would otherwise concatenate the strings instead
# of silently dropping the column.
AGI_grouped = (
    AGI_column_subset
    .groupby('zipcode')
    .sum(numeric_only=True)
    .reset_index()
)
In [64]:
# Preview the first rows of the per-zipcode totals.
AGI_grouped.head()
Out[64]:
In [65]:
# Mean of the population column across all zipcode groups.
AGI_grouped['population'].mean()
Out[65]:
In [66]:
# Treat zipcode '00000' as missing by replacing it with NaN.
# (The na_values parameter of read_csv could do the same at load time.)
null_zips = AGI_grouped['zipcode'].eq('00000')
AGI_grouped.loc[null_zips, 'zipcode'] = np.nan
In [67]:
# Preview again: the '00000' zipcode should now show up as NaN.
AGI_grouped.head()
Out[67]:
In [68]:
# Copy of the grouped frame without any row that contains a missing value.
AGI_notnull = AGI_grouped.dropna(axis=0, how='any')
In [69]:
# Mean population once the rows with missing values are excluded.
AGI_notnull['population'].mean()
Out[69]:
In [70]:
# Remove the rows with missing values from AGI_grouped itself.
AGI_grouped.dropna(axis=0, how='any', inplace=True)
In [73]:
# 'amount' is expressed in thousands; store the real figure in a new column.
AGI_grouped['actual_amount'] = AGI_grouped['amount'].mul(1000)
In [74]:
# Per-zipcode mean: the actual amount divided by the population.
AGI_grouped['weighted_mean_AGI'] = AGI_grouped['actual_amount'].div(
    AGI_grouped['population']
)
In [75]:
# Round every value in the column to zero decimal places. The vectorized
# Series.round replaces the original per-element apply(lambda x: round(x, 0));
# apply with a lambda is still the tool for transformations that have no
# built-in equivalent.
AGI_grouped['weighted_mean_AGI'] = AGI_grouped['weighted_mean_AGI'].round(0)
In [77]:
# Preview the frame with the new actual_amount and weighted_mean_AGI columns.
AGI_grouped.head()
Out[77]:
In [79]:
# Discard the columns that are no longer needed (axis=1 means columns).
AGI_grouped.drop(
    labels=['AGI_STUB', 'amount', 'actual_amount'],
    axis=1,
    inplace=True,
)
In [81]:
# Preview the frame after dropping the helper columns.
AGI_grouped.head()
Out[81]:
In [37]:
# One row per distinct (zipcode, STATE) pair.
# pandas.Series.unique is a related tool worth looking into.
AGI_subset_geo = (
    AGI[['zipcode', 'STATE']]
    .drop_duplicates()
)
In [82]:
# Preview the zipcode/STATE lookup; show only the first rows instead of the whole frame.
AGI_subset_geo.head()
Out[82]:
In [83]:
# Use merge (not join) when the shared key is an ordinary column rather than
# the index. A left merge keeps every row of AGI_grouped.
AGI_final = AGI_grouped.merge(
    AGI_subset_geo,
    how='left',
    on='zipcode',
)
In [84]:
# Preview the merged result.
AGI_final.head()
Out[84]:
In [56]:
# Show the rows with the greatest weighted_mean_AGI first.
# DataFrame.sort has been removed from pandas; sort_values is its replacement.
AGI_final.sort_values('weighted_mean_AGI', ascending=False).head()
Out[56]:
In [89]:
# Chain methods: average the numeric columns per STATE, then order the states
# by weighted_mean_AGI, highest first.
# numeric_only=True leaves the string zipcode column out of the mean, and
# sort_values replaces the removed DataFrame.sort.
(
    AGI_final
    .groupby('STATE')
    .mean(numeric_only=True)
    .sort_values('weighted_mean_AGI', ascending=False)
)
Out[89]:
In [88]:
# Bar chart of the five rows with the lowest weighted_mean_AGI.
# sort_values replaces the removed DataFrame.sort.
AGI_final.sort_values('weighted_mean_AGI').head().plot(kind='bar')
Out[88]:
Python for Data Analysis written by Wes McKinney, creator of pandas
The inimitable Julia Evans' pandas cookbook
Cyrille Rossant's books on IPython