Use conda to install seaborn by running the following in a terminal:
conda install seaborn
In [ ]:
# import the pandas library and alias it as 'pd'
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# enable inline plots; without this, plots don't show up in the notebook
%matplotlib inline
Python has long been great for data munging and preparation, but less so for data analysis and modeling. pandas helps fill this gap, enabling you to carry out your entire data analysis workflow in Python without having to switch to a more domain-specific language like R.
Combined with the excellent IPython toolkit and other libraries, the environment for doing data analysis in Python excels in performance, productivity, and the ability to collaborate.
pandas does not implement significant modeling functionality outside of linear and panel regression; for this, look to statsmodels and scikit-learn. More work is still needed to make Python a first-class statistical modeling environment, but we are well on our way toward that goal.
In [ ]:
# various options in pandas
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 20)
pd.set_option('display.precision', 3)
"Census Income" dataset
In [ ]:
# download the data and name the columns
cols = ['age', 'workclass', 'fnlwgt', 'education', 'education_num',
'marital_status', 'occupation', 'relationship', 'ethnicity',
'gender', 'capital_gain', 'capital_loss', 'hours_per_week',
'native_country', 'income']
df = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data',
names = cols)
pandas can load a lot more than CSVs; this tutorial shows how pandas can read Excel files, SQL tables, and even data copied to the clipboard:
http://www.gregreda.com/2013/10/26/intro-to-pandas-data-structures/
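For example, a minimal sketch of a few of those readers (the file, database, and table names below are hypothetical placeholders, not files shipped with this notebook):
In [ ]:
# read an Excel sheet (requires an Excel engine such as openpyxl or xlrd)
# df_xl = pd.read_excel('some_file.xlsx')
# read a table copied to the clipboard from a web page or spreadsheet
# df_clip = pd.read_clipboard()
# read the result of a SQL query from a SQLite database
# import sqlite3
# conn = sqlite3.connect('some_database.db')
# df_sql = pd.read_sql('SELECT * FROM some_table', conn)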
In [ ]:
# we can see there are no null values
# columns with numerical values were inferred as int64, so there's no need to set data types manually
df.info()
In [ ]:
# view the first 5 rows, or specify a number, e.g. .head(10)
df.head()
In [6]:
# there's a space before each string in this data
df.education.unique()
Out[6]:
In [7]:
# looks like it's in every object column
df.workclass.unique()
Out[7]:
In [8]:
# loop through each object (string) column and strip the leading/trailing spaces
for col in df:
    if df[col].dtype == 'O':
        df[col] = df[col].map(lambda x: x.strip(' '))
In [ ]:
# Here's a breakdown of what that for loop is doing
In [ ]:
# loop through df and get the column names
for col in df:
    print(col)
In [ ]:
# gets the column type
df.education.dtype
In [ ]:
# if this is True, the map function is applied
df.education.dtype == object
In [ ]:
# strip function
x = ' string'
x.strip(' ')
In [ ]:
# lambda creates a 'throw away' or 'anonymous' function
strip_string = lambda x: x.strip(' ')
strip_string(' string')
In [ ]:
# same as this
def strip_string2(x):
    x = x.strip(' ')
    return x
In [ ]:
strip_string2(' string')
In [ ]:
# map applies the function to each item in the DataFrame column, so
df[col].map(lambda x: x.strip(' '))
# does the same thing as
df['workclass'].map(strip_string2)
# but with the lambda we don't have to define and name a function first
For more info on lambda and map, see the Python documentation.
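As an aside, pandas' vectorized string methods can do the same cleanup without writing a lambda at all; a minimal sketch of the loop above using .str.strip():
In [ ]:
# equivalent cleanup using the built-in .str accessor on object columns
for col in df:
    if df[col].dtype == 'O':
        df[col] = df[col].str.strip()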
In [9]:
df.education.value_counts()
Out[9]:
In [10]:
df.hours_per_week.mean()
Out[10]:
In [11]:
df[['age', 'capital_gain', 'capital_loss', 'hours_per_week']].describe()
Out[11]:
In [12]:
# as we saw with df.info() there are no nulls...
# but if there were this would find the rows where age is null
df[df.age.isnull()]
Out[12]:
In [13]:
# you could drop all those rows with
df_no_nulls = df[df.age.notnull()]
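An equivalent shortcut is dropna; a minimal sketch:
In [ ]:
# drop rows where age is null (same result as the notnull() filter above)
df_no_nulls = df.dropna(subset=['age'])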
In [14]:
null_df = pd.DataFrame([1,2,4,np.nan], columns = ['column1'])
In [15]:
null_df
Out[15]:
In [16]:
# you can also fill nulls with a value or string
null_df.column1.fillna(1000)
Out[16]:
In [17]:
null_df.column1.fillna(null_df.column1.median())
Out[17]:
In [18]:
null_df.column1.fillna('string')
Out[18]:
In [19]:
# select a row
df.iloc[0]
Out[19]:
In [ ]:
# select a range of rows
df.iloc[10:15]
In [20]:
# last 2 rows
df.iloc[-2:]
Out[20]:
In [21]:
# select every other row from columns 3 through 5
df.iloc[::2, 2:5].head()
Out[21]:
In [22]:
df.loc[0:2, ['age', 'relationship']]
Out[22]:
by label: .loc[]
by integer position: .iloc[]
for both: .ix[]
In [23]:
# pd.DataFrame lets you turn series, arrays, lists, and more into DataFrame structures
df_index = pd.DataFrame([[1,2,3,4],[2,4,6,8],[3,5,7,9]], [11,13,12], columns = ['A', 'C', 'D', 'B'])
In [24]:
df_index
Out[24]:
In [25]:
# iloc indexes by position, not by the labels in the index
df_index.iloc[0:1]
Out[25]:
In [26]:
# with loc both the start and the stop are included
df_index.loc[11:12]
Out[26]:
In [27]:
# select columns by position
df_index.iloc[:,0:1]
Out[27]:
In [28]:
# or by label
df_index.loc[:,'A':'D']
Out[28]:
In [29]:
# ix: primarily label based, but will fall back to integer positional access
df_index.ix[:,'A':'C']
Out[29]:
In [30]:
# ix: primarily label based, but will fall back to integer positional access
df_index.ix[:,0:2]
Out[30]:
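Note that .ix has been deprecated (and later removed) in newer versions of pandas; the same selections can be written explicitly, a minimal sketch:
In [ ]:
# label-based version of the .ix selection above
df_index.loc[:, 'A':'C']
# position-based version
df_index.iloc[:, 0:2]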
In [31]:
df.columns
Out[31]:
In [36]:
# rename a column (with inplace=False a new DataFrame is returned and df itself is unchanged)
df.rename(columns = {'native_country' : 'country_of_origin'}, inplace = False)
Out[36]:
In [37]:
df.native_country.unique()
Out[37]:
In [38]:
df[df.native_country == 'United-States'].head()
Out[38]:
In [39]:
df[(df.native_country != 'United-States') & (df.education_num > 9)].head()
Out[39]:
In [40]:
df[(df.native_country != 'United-States') & (df.education_num > 9)].income.value_counts()
Out[40]:
In [41]:
# group by a column and apply a function like sum, count, or mean
df.groupby(['relationship']).mean()
Out[41]:
In [42]:
# group by multiple columns and apply multiple aggregation functions
df.groupby(['income', 'native_country']).age.agg(['count', 'mean'])
# groups are nested in the order the columns are listed
Out[42]:
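If the nested result is easier to read as a table, .unstack() can pivot the inner group level (native_country here) into columns; a minimal sketch:
In [ ]:
# pivot the native_country level of the group index into columns
df.groupby(['income', 'native_country']).age.agg(['count', 'mean']).unstack()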
In [43]:
# combine groupby with a boolean filter
df[df.native_country != 'United-States'].groupby(['education']).hours_per_week.mean()
Out[43]:
In [44]:
df.age.hist(bins = 18);
In [45]:
# split the histograms by another column (with relatively few unique values)
df.hours_per_week.hist(by = df.income, bins = 25, figsize = (10,5));
In [46]:
# use value_counts() and a bar plot
df['workclass'].value_counts().plot(kind = 'bar')
Out[46]:
In [47]:
df.boxplot(['age'], by = 'relationship');
In [48]:
df.plot(kind='scatter',
        x = 'age',
        y = 'hours_per_week',
        alpha = .25,
        figsize = (10,5))
Out[48]:
In [ ]:
# how many males and females are in this data set
In [ ]:
In [ ]:
# plot the total number of people in each occupation
In [ ]:
In [ ]:
# which occupation has the lowest average age?
In [ ]:
In [ ]:
# create a boxplot of hours per week by education level
In [ ]:
In [ ]:
# create a new column for income where >50K = 1 and <=50K = 0
# hint... http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.replace.html
In [ ]:
In [ ]:
# find which "native_country" has the highest percent of people earning >50K
In [ ]:
In [ ]:
# visualize what percent of people earn over 50K by education level
In [ ]:
In [ ]:
# make a hexbin plot
In [ ]:
In [ ]:
# check out plotting with Seaborn
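For example, a minimal seaborn sketch using the df loaded above:
In [ ]:
# hours worked per week split by income bracket
sns.boxplot(x='income', y='hours_per_week', data=df);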