In [ ]:
from __future__ import division
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import rpy2
from IPython.display import display, Image, YouTubeVideo
%matplotlib inline
Cost -- compare capabilities between software you already use and open source here
Allows a diversity of platforms on a given team
They're amenable to sketching, and they're amenable to reproducibility.
You can retrace your own steps and also make a narrative for someone else to follow.
Built-in documentation improves workflow; magic methods anticipate data analysis needs.
In [ ]:
pd.re
from math import
In [ ]:
?pd.read_csv
In [ ]:
%quickref
In [ ]:
%lsmagic
In [ ]:
%load_ext rmagic
In [ ]:
%R x <- c(0:10, 50)
%R xm <- mean(x)
%R c(xm, mean(x, trim = 0.10))
In [ ]:
YouTubeVideo("L4Hbv4ugUWk")
In [ ]:
!ls
Use what's here or roll your own.
Source: IRS.gov
In [ ]:
?pd.read_csv()
In [ ]:
# read in a CSV
# specify that zipcode should be treated as a string rather than an int!
AGI = pd.read_csv('12zpallagi.csv',dtype={'zipcode': str})
In [ ]:
AGI.info()
In [ ]:
# you can select columns by label or position!
AGI_column_subset = AGI[['STATE','AGI_STUB','zipcode','N1','A00100']]
In [ ]:
# get information about type for a given field, and how many values you can expect for each
AGI_column_subset.info()
In [ ]:
AGI_column_subset.describe()
In [ ]:
# note this is inclusive!
AGI_row_subset = AGI_column_subset.ix[6:11]
In [ ]:
AGI_row_subset
In [ ]:
AGI_column_subset.rename(columns={'N1':'population','A00100':'amount'},inplace=True)
In [ ]:
AGI_column_subset.head()
In [ ]:
# group by zipcode and sum other values, resetting index
AGI_grouped = AGI_column_subset.groupby('zipcode').sum().reset_index()
In [ ]:
AGI_grouped.head()
In [ ]:
AGI_grouped['population'].mean()
In [ ]:
#this can also be done using the na_values param upon being read in
null_zips = (AGI_grouped['zipcode'] == '00000')
AGI_grouped.loc[null_zips, 'zipcode'] = np.nan
In [ ]:
AGI_grouped.head()
In [ ]:
AGI_notnull = AGI_grouped.dropna()
In [ ]:
AGI_notnull['population'].mean()
In [ ]:
AGI_grouped.dropna(inplace=True)
In [ ]:
# make a new column with the real amount, not in thousands
AGI_grouped['actual_amount'] = AGI_grouped['amount'] * 1000
Keep in mind you have options, and use magic methods to test implementation inline!
In [ ]:
%timeit applied = AGI_grouped['amount'].apply(lambda x: x * 1000)
In [ ]:
#being vectorized operations, this is happening at the C level and thereby much faster
%timeit vectorized = AGI_grouped['amount'] * 1000
In [ ]:
AGI_grouped
In [ ]:
# make a mean, using standard math operations!
AGI_grouped['weighted_mean_AGI'] = AGI_grouped['actual_amount']/AGI_grouped['population']
In [ ]:
#use anonymous functions to change every value in a column!
#because this is an apply, much slower
AGI_grouped['weighted_mean_AGI']= AGI_grouped['weighted_mean_AGI'].apply(lambda x: round(x, 0))
In [ ]:
AGI_grouped.info()
In [ ]:
AGI_grouped.describe()
In [ ]:
# drop columns you won't need
AGI_grouped.drop(['AGI_STUB','amount','actual_amount'],axis=1,inplace=True)
In [ ]:
AGI_grouped.head()
In [ ]:
# also look into pandas.Series.unique
AGI_subset_geo = AGI[['zipcode','STATE']].drop_duplicates()
In [ ]:
AGI_subset_geo.head()
In [ ]:
#merge rather than join if you want to use a common column other than the index
AGI_final = pd.merge(AGI_grouped, AGI_subset_geo, how='left', on='zipcode')
In [ ]:
AGI_final.head()
In [ ]:
# this gives you the greatest weighted_mean_AGI first
AGI_final.sort('weighted_mean_AGI',ascending=False).head()
In [ ]:
# chain methods!
AGI_final.groupby('STATE').mean().sort('weighted_mean_AGI',ascending=False)
In [ ]:
AGI_final.sort('weighted_mean_AGI').head().plot(kind='bar')
Python for Data Analysis written by Wes McKinney, creator of pandas
Python Data Science Handbook written by Jake VanderPlas
The inimitable Julia Evans' pandas cookbook
Cyrille Rossant's books on IPython