In [55]:
import numpy as np
import pandas as pd
from IPython.display import display, Image, YouTubeVideo
%matplotlib inline
Cost — compare the capabilities of the commercial software you already pay for with the open-source tools shown here.
Allows a diversity of platforms on a given team
They're amenable to sketching, and they're amenable to reproducibility.
You can retrace your own steps and also make a narrative for someone else to follow.
Built-in documentation streamlines your workflow; magic commands and tab completion anticipate it.
# Tab-completion demo: type `pd.re` and press <TAB> to list completions
# (pd.read_csv, pd.read_excel, ...). Completion also works on imports,
# e.g. `from math import <TAB>`.
# NOTE(review): the original cell contained the invalid fragment
# "pd.re from math import"; commented out so the cell runs cleanly.
In [2]:
# prefix `?` opens IPython's help pane with the object's signature and docstring
?pd.read_csv
$\bar{x} = \frac{\sum_{i=1}^{n}w_i\cdot x_i}{\sum_{i=1}^{n}w_i}$
# R demo: a vector of 0..10 plus an outlier of 50
x <- c(0:10, 50)
# plain arithmetic mean — pulled upward by the outlier
xm <- mean(x)
# compare the plain mean with a 10% trimmed mean, which discards extreme values
c(xm, mean(x, trim = 0.10))
The final cell type is "Raw NBConvert"
In [3]:
# open IPython's quick-reference card of magics and shortcuts
%quickref
In [4]:
%%python2
# cell magic: run this cell's body in a Python 2 subprocess (print is a statement there)
print "hello world"
In [5]:
# show the input history of the current session
%history
In [6]:
# list the available line magics
# (output also includes cell magics and the automagic setting)
%lsmagic
Out[6]:
In [7]:
# embed a YouTube video by id; the returned object renders via its rich repr
video_id = "L4Hbv4ugUWk"
YouTubeVideo(video_id)
Out[7]:
In [8]:
# `!` runs a shell command from the notebook; list files in the working directory
!ls
Use what's here or roll your own.
Source: IRS.gov
In [9]:
# trailing `?` is equivalent to the `?`-prefix form: open help for pandas.read_csv
pd.read_csv?
In [10]:
# read in a CSV of IRS adjusted-gross-income data
# zipcode must be parsed as a string so leading zeros survive
IRS_CSV = 'IRS_data/12zpallagi.csv'
AGI = pd.read_csv(IRS_CSV, dtype={'zipcode': str})
In [11]:
# column dtypes, non-null counts, and memory footprint of the raw frame
AGI.info()
In [12]:
# keep only the columns this analysis needs (label-based selection)
keep_cols = ['STATE', 'AGI_STUB', 'zipcode', 'N1', 'A00100']
AGI_column_subset = AGI.loc[:, keep_cols]
In [13]:
# get information about type for a given field, and how many values you can expect for each
AGI_column_subset.info()
In [14]:
# summary statistics (count / mean / std / quartiles) for the numeric columns
AGI_column_subset.describe()
Out[14]:
In [15]:
# note this is inclusive!
# .ix was deprecated in pandas 0.20 and removed in 1.0; .loc is the
# label-based replacement and is likewise end-inclusive on this RangeIndex
AGI_column_subset.loc[6:11]
Out[15]:
In [16]:
# give the cryptic IRS field codes human-readable names
friendly_names = {'N1': 'population', 'A00100': 'amount'}
AGI_column_subset = AGI_column_subset.rename(columns=friendly_names)
# (rename(..., inplace=True) would mutate in place; reassignment keeps the cell idempotent)
In [17]:
# easy way to look at a subset of data without having to specify rows
AGI_column_subset.head()
Out[17]:
In [18]:
# group by zipcode and sum the numeric values, resetting the index
# numeric_only=True keeps modern pandas from trying to sum the STATE strings;
# older pandas silently dropped non-numeric columns, and STATE is merged back later
AGI_grouped = AGI_column_subset.groupby('zipcode').sum(numeric_only=True).reset_index()
In [19]:
# sanity-check the grouped frame
AGI_grouped.head()
Out[19]:
In [20]:
# mean population per zipcode — placeholder zips ('00000'/'99999') still skew this
AGI_grouped['population'].mean()
Out[20]:
In [21]:
#this can also be done using the na_values parameter upon being read in
# treat the IRS placeholder zipcodes as missing values
placeholder_zips = ['00000', '99999']
null_zips = AGI_grouped['zipcode'].isin(placeholder_zips)
AGI_grouped.loc[null_zips, 'zipcode'] = np.nan
In [22]:
# verify the placeholder zipcodes are now NaN
AGI_grouped.head()
Out[22]:
In [23]:
# copy with the NaN-zipcode rows removed (AGI_grouped itself is untouched here)
AGI_notnull = AGI_grouped.dropna()
In [24]:
# mean population again — compare with the earlier value to see the placeholder effect
AGI_notnull['population'].mean()
Out[24]:
In [25]:
# drop the NaN-zipcode rows; reassignment is preferred over inplace=True
# (no performance benefit to inplace, and it blocks method chaining)
AGI_grouped = AGI_grouped.dropna()
In [26]:
# derive the real dollar amount — the source figures are in thousands
AGI_grouped = AGI_grouped.assign(actual_amount=AGI_grouped['amount'] * 1000)
Keep in mind you have options, and use magic commands like %timeit to test implementations inline!
In [27]:
# %timeit runs the statement many times and reports the best timing;
# apply() invokes a Python lambda per element, so expect this to be slow
%timeit applied = AGI_grouped['amount'].apply(lambda x: x * 1000)
In [28]:
#being vectorized operations, this is happening at the C level and thereby much faster
# compare against the apply() timing in the previous cell
%timeit vectorized = AGI_grouped['amount'] * 1000
In [29]:
# confirm the new actual_amount column at the end of the frame
AGI_grouped.tail()
Out[29]:
In [30]:
# make a mean, using standard math operations!
# dollars per return for each zipcode
per_return = AGI_grouped['actual_amount'] / AGI_grouped['population']
AGI_grouped['weighted_mean_AGI'] = per_return
In [31]:
# round every value in the column to whole dollars
# Series.round(0) is vectorized (C level) and gives the same result as
# apply(lambda x: round(x, 0)) — both use round-half-to-even — but far faster
AGI_grouped['weighted_mean_AGI'] = AGI_grouped['weighted_mean_AGI'].round(0)
In [32]:
# check dtypes and non-null counts after the derived columns were added
AGI_grouped.info()
In [33]:
# drop columns you won't need
# reassign instead of inplace=True: no performance benefit to inplace,
# and reassignment keeps the frame chainable
AGI_grouped = AGI_grouped.drop(columns=['AGI_STUB', 'amount', 'actual_amount'])
In [34]:
# verify only zipcode, population, and weighted_mean_AGI remain
AGI_grouped.head()
Out[34]:
In [35]:
# also look into pandas.Series.unique
# one row per (zipcode, STATE) pair, to re-attach STATE after the groupby
AGI_subset_geo = AGI[['zipcode','STATE']].drop_duplicates()
In [36]:
# peek at the zipcode-to-state mapping instead of dumping the entire frame
AGI_subset_geo.head()
Out[36]:
In [37]:
#merge rather than join if you want to use a common column other than the index
# left merge keeps every grouped zipcode even if it has no state match
AGI_final = AGI_grouped.merge(AGI_subset_geo, how='left', on='zipcode')
In [38]:
# spot-check the merged frame, including the re-attached STATE column
AGI_final.tail()
Out[38]:
In [39]:
# this gives you the greatest weighted_mean_AGI first
AGI_final.sort_values(by='weighted_mean_AGI',ascending=False).head()
Out[39]:
In [40]:
# chain methods!
# numeric_only=True: zipcode is a string column and cannot be averaged —
# modern pandas raises on it, older pandas silently dropped it
AGI_final.groupby('STATE').mean(numeric_only=True).sort_values(by='weighted_mean_AGI',ascending=False)
Out[40]:
In [41]:
# five states with the highest average weighted_mean_AGI
# numeric_only=True keeps .mean() from choking on the string zipcode column
top_5_states = (
    AGI_final.groupby('STATE')
    .mean(numeric_only=True)
    .sort_values(by='weighted_mean_AGI', ascending=False)
    .reset_index()
    .head()
)
In [42]:
# bar chart of the five highest-AGI states
top_5_states.plot(x='STATE', kind='bar')
Out[42]:
In [43]:
# `cat` is an easy way to examine file contents in place
# !cat tourism_data/581a4d76-9f6d-4786-b22c-73d59627d1e2_v2.csv
In [44]:
# read in a CSV, skipping the four metadata lines at the top of the file
# (without skiprows=4 the header row is mis-detected)
TOURISM_CSV = 'tourism_data/581a4d76-9f6d-4786-b22c-73d59627d1e2_v2.csv'
df = pd.read_csv(TOURISM_CSV, skiprows=4)
In [45]:
# get information about type for a given field, and how many values you can expect for each
df.info()
In [46]:
# discard columns that are entirely empty
df_subset = df.dropna(axis='columns', how='all')
In [47]:
# confirm the all-NaN columns are gone
df_subset.info()
In [48]:
# these two fields are constant across the dataset, so drop them
# (reassignment rather than inplace=True keeps the cell idempotent)
indicator_cols = ['Indicator Name', 'Indicator Code']
df_subset = df_subset.drop(columns=indicator_cols)
In [49]:
# one row per country, one column per year — wide format
df_subset.head()
Out[49]:
In [50]:
# reshape wide -> long: one row per country-year observation
df_melted = df_subset.melt(id_vars=['Country Name', 'Country Code'])
In [51]:
# show just the first rows instead of dumping the entire melted frame
df_melted.head()
Out[51]:
In [52]:
# give the melt defaults meaningful names; reassign rather than inplace=True
# (no performance benefit to inplace, and it blocks chaining)
df_melted = df_melted.rename(columns={'variable':'Year','value':'Tourists'})
In [53]:
# rows for the Dominican Republic only (ISO code DOM)
is_dom = df_melted['Country Code'] == 'DOM'
df_melted[is_dom]
Out[53]:
In [54]:
# tourist arrivals over time for the Dominican Republic
dom_rows = df_melted[df_melted['Country Code'] == 'DOM']
dom_rows.plot(x='Year')
Out[54]: