In [1]:
# conventional way to import pandas
import pandas as pd
# check the pandas version
print('pandas version:', pd.__version__)
In [2]:
# read a dataset of Chipotle orders into a DataFrame
orders = pd.read_table('data/chipotle.tsv')
In [3]:
# examine the first 5 rows
orders.head()
Out[3]:
Documentation for read_table
In [4]:
# read a dataset of movie reviewers (the default parameters don't parse this file correctly)
users = pd.read_table('data/u.user')
# examine the first 5 rows
users.head()
Out[4]:
In [5]:
users = pd.read_table('data/u.user', sep='|')
# examine the first 5 rows
users.head()
Out[5]:
In [6]:
users = pd.read_table('data/u.user', sep='|', header=None)
# examine the first 5 rows
users.head()
Out[6]:
In [7]:
user_cols = ['user_id', 'age', 'gender', 'occupation', 'zip_code']
users = pd.read_table('data/u.user', sep='|', header=None, names=user_cols)
# examine the first 5 rows
users.head()
Out[7]:
In [7]:
# read a dataset of UFO reports into a DataFrame
ufo = pd.read_table('data/ufo.csv', sep=',')
In [8]:
# read_csv is equivalent to read_table, except it assumes a comma separator
ufo = pd.read_csv('data/ufo.csv')
type(ufo)
Out[8]:
In [9]:
# examine the first 5 rows
ufo.head()
Out[9]:
In [10]:
# select the 'City' Series using bracket notation
ufo['City']
Out[10]:
In [16]:
type(ufo['City'])
Out[16]:
In [17]:
# or equivalently, use dot notation - see notes below
ufo.City
Out[17]:
Bracket notation will always work, whereas dot notation has limitations:
- dot notation doesn't work if there are spaces in the Series name
- dot notation doesn't work if the Series has the same name as a DataFrame method or attribute (like 'head' or 'shape')
- dot notation can't be used to define the name of a new Series (see below)
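A minimal sketch of the first two limitations (the small DataFrame here is hypothetical, not part of the dataset):
# hypothetical example: a column name with a space, and one that collides with a DataFrame attribute
df = pd.DataFrame({'my col': [1, 2], 'shape': [3, 4]})
df['my col']   # bracket notation works for any column name
df['shape']    # bracket notation selects the 'shape' column
df.shape       # dot notation returns the (rows, columns) tuple, NOT the 'shape' column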
In [10]:
# create a new 'Location' Series (must use bracket notation to define the Series name)
ufo['Location'] = ufo.City + ', ' + ufo.State
ufo.head()
Out[10]:
In [11]:
# read a dataset of top-rated IMDb movies into a DataFrame
movies = pd.read_csv('data/imdb_1000.csv')
Methods end with parentheses, while attributes don't:
In [12]:
# example method: show the first 5 rows
movies.head()
Out[12]:
In [13]:
# example method: calculate summary statistics
movies.describe()
Out[13]:
In [14]:
# example attribute: number of rows and columns
movies.shape
Out[14]:
In [15]:
# example attribute: data type of each column
movies.dtypes
Out[15]:
In [16]:
# use an optional parameter to the describe method to summarize only 'object' columns
movies.describe(include=['object'])
Out[16]:
Documentation for describe
In [11]:
# read a dataset of UFO reports into a DataFrame
ufo = pd.read_csv('data/ufo.csv')
In [12]:
# examine the column names
ufo.columns
Out[12]:
In [13]:
# rename two of the columns by using the 'rename' method
ufo.rename(columns={'Colors Reported':'Colors_Reported', 'Shape Reported':'Shape_Reported'}, inplace=True)
ufo.columns
Out[13]:
Documentation for rename
In [22]:
# replace all of the column names by overwriting the 'columns' attribute
ufo_cols = ['city', 'colors reported', 'shape reported', 'state', 'time']
ufo.columns = ufo_cols
ufo.columns
Out[22]:
In [23]:
# replace the column names during the file reading process by using the 'names' parameter
ufo = pd.read_csv('data/ufo.csv', header=0, names=ufo_cols)
ufo.head()
Out[23]:
Documentation for read_csv
In [24]:
# replace all spaces with underscores in the column names by using the 'str.replace' method
ufo.columns = ufo.columns.str.replace(' ', '_')
ufo.columns
Out[24]:
Documentation for str.replace
In [25]:
# read a dataset of UFO reports into a DataFrame
ufo = pd.read_csv('data/ufo.csv')
ufo.head()
Out[25]:
In [24]:
# remove a single column (axis=1 refers to columns)
ufo.drop('Colors Reported', axis=1, inplace=True)
ufo.head()
Out[24]:
Documentation for drop
In [25]:
# remove multiple columns at once
ufo.drop(['City', 'State'], axis=1, inplace=True)
ufo.head()
Out[25]:
In [26]:
# remove multiple rows at once (axis=0 refers to rows)
ufo.drop([0, 1], axis=0, inplace=True)
ufo.head()
Out[26]:
In [28]:
# read a dataset of top-rated IMDb movies into a DataFrame
movies = pd.read_csv('data/imdb_1000.csv')
movies.head()
Out[28]:
Note: None of the sorting methods below affect the underlying data. (In other words, the sorting is temporary).
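If you did want a sort to persist, assign the result back or pass inplace=True; a minimal sketch:
# persist the sorted order by assignment
movies = movies.sort_values('title')
# or equivalently, sort the DataFrame in place
movies.sort_values('title', inplace=True)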
In [29]:
# sort the 'title' Series in ascending order (returns a Series)
movies.title.sort_values()
Out[29]:
In [29]:
# sort in descending order instead
movies.title.sort_values(ascending=False).head()
Out[29]:
Documentation for sort_values for a Series. (Prior to version 0.17, use order instead.)
In [31]:
# sort the entire DataFrame by the 'title' Series (returns a DataFrame)
movies.sort_values('title').head()
Out[31]:
In [33]:
# sort in descending order instead
movies.sort_values('title', ascending=False).head()
Out[33]:
Documentation for sort_values for a DataFrame. (Prior to version 0.17, use sort instead.)
In [32]:
# sort the DataFrame first by 'content_rating', then by 'duration'
movies.sort_values(['content_rating', 'duration']).head()
Out[32]:
Summary of changes to the sorting API in pandas 0.17
In [35]:
# read a dataset of top-rated IMDb movies into a DataFrame
movies = pd.read_csv('data/imdb_1000.csv')
movies.head()
Out[35]:
In [34]:
# examine the number of rows and columns
movies.shape
Out[34]:
Goal: Filter the DataFrame rows to only show movies with a 'duration' of at least 200 minutes.
In [35]:
# create a list in which each element refers to a DataFrame row: True if the row satisfies the condition, False otherwise
booleans = []
for length in movies.duration:
    if length >= 200:
        booleans.append(True)
    else:
        booleans.append(False)
In [36]:
# confirm that the list has the same length as the DataFrame
len(booleans)
Out[36]:
In [37]:
# examine the first five list elements
booleans[0:5]
Out[37]:
In [38]:
# convert the list to a Series
is_long = pd.Series(booleans)
is_long.head()
Out[38]:
In [39]:
# use bracket notation with the boolean Series to tell the DataFrame which rows to display
movies[is_long]
Out[39]:
In [40]:
# simplify the steps above: no need to write a for loop to create 'is_long' since pandas will broadcast the comparison
is_long = movies.duration >= 200
movies[is_long]
# or equivalently, write it in one line (no need to create the 'is_long' object)
movies[movies.duration >= 200]
Out[40]:
In [41]:
# select the 'genre' Series from the filtered DataFrame
is_long = movies.duration >= 200
movies[is_long].genre
# or equivalently, use the 'loc' method
movies.loc[movies.duration >= 200, 'genre']
Out[41]:
Documentation for loc
In [36]:
# read a dataset of top-rated IMDb movies into a DataFrame
movies = pd.read_csv('data/imdb_1000.csv')
movies.head()
Out[36]:
In [37]:
# filter the DataFrame to only show movies with a 'duration' of at least 200 minutes
movies[movies.duration >= 200]
Out[37]:
Understanding logical operators:
- and: True only if both sides of the operator are True
- or: True if either side of the operator is True
In [44]:
# demonstration of the 'and' operator
print(True and True)
print(True and False)
print(False and False)
In [45]:
# demonstration of the 'or' operator
print(True or True)
print(True or False)
print(False or False)
Rules for specifying multiple filter criteria in pandas:
- use & instead of and
- use | instead of or
- surround each condition with parentheses to control the order of evaluation
Goal: Further filter the DataFrame of long movies (duration >= 200) to only show movies which also have a 'genre' of 'Drama'
In [46]:
# CORRECT: use the '&' operator to specify that both conditions are required
movies[(movies.duration >= 200) & (movies.genre == 'Drama')]
Out[46]:
In [47]:
# INCORRECT: using the '|' operator would have shown movies that are either long or dramas (or both)
movies[(movies.duration >= 200) | (movies.genre == 'Drama')].head()
Out[47]:
Goal: Filter the original DataFrame to show movies with a 'genre' of 'Crime' or 'Drama' or 'Action'
In [44]:
# use the '|' operator to specify that a row can match any of the three criteria
movies[(movies.genre == 'Crime') | (movies.genre == 'Drama') | (movies.genre == 'Action')].tail(20)
# or equivalently, use the 'isin' method
# movies[movies.genre.isin(['Crime', 'Drama', 'Action'])].tail(20)
Out[44]:
Documentation for isin
Question: When reading from a file, how do I read in only a subset of the columns?
In [49]:
# read a dataset of UFO reports into a DataFrame, and check the columns
ufo = pd.read_csv('data/ufo.csv')
ufo.columns
Out[49]:
In [50]:
# specify which columns to include by name
ufo = pd.read_csv('data/ufo.csv', usecols=['City', 'State'])
# or equivalently, specify columns by position
ufo = pd.read_csv('data/ufo.csv', usecols=[0, 4])
ufo.columns
Out[50]:
Question: When reading from a file, how do I read in only a subset of the rows?
In [51]:
# specify how many rows to read
ufo = pd.read_csv('data/ufo.csv', nrows=3)
ufo
Out[51]:
Documentation for read_csv
Question: How do I iterate through a Series?
In [52]:
# Series are directly iterable (like a list)
for c in ufo.City:
    print(c)
Question: How do I iterate through a DataFrame?
In [53]:
# various methods are available to iterate through a DataFrame
for index, row in ufo.iterrows():
    print(index, row.City, row.State)
Documentation for iterrows
Question: How do I drop all non-numeric columns from a DataFrame?
In [45]:
# read a dataset of alcohol consumption into a DataFrame, and check the data types
drinks = pd.read_csv('data/drinks.csv')
drinks.dtypes
Out[45]:
In [55]:
# only include numeric columns in the DataFrame
import numpy as np
drinks.select_dtypes(include=[np.number]).dtypes
Out[55]:
Documentation for select_dtypes
Question: How do I know whether I should pass an argument as a string or a list?
In [56]:
# describe all of the numeric columns
drinks.describe()
Out[56]:
In [57]:
# pass the string 'all' to describe all columns
drinks.describe(include='all')
Out[57]:
In [58]:
# pass a list of data types to only describe certain types
drinks.describe(include=['object', 'float64'])
Out[58]:
In [59]:
# pass a list even if you only want to describe a single data type
drinks.describe(include=['object'])
Out[59]:
Documentation for describe
In [60]:
# read a dataset of alcohol consumption into a DataFrame
drinks = pd.read_csv('data/drinks.csv')
drinks.head()
Out[60]:
In [61]:
# drop a column (temporarily)
drinks.drop('continent', axis=1).head()
Out[61]:
Documentation for drop
In [62]:
# drop a row (temporarily)
drinks.drop(2, axis=0).head()
Out[62]:
When referring to rows or columns with the axis parameter:
- axis 0 refers to rows
- axis 1 refers to columns
In [46]:
# calculate the mean of each numeric column
drinks.mean()
# or equivalently, specify the axis explicitly
drinks.mean(axis=0)
Out[46]:
Documentation for mean
In [64]:
# calculate the mean of each row
drinks.mean(axis=1).head()
Out[64]:
When performing a mathematical operation with the axis parameter:
- axis 0 means the operation should "move down" the row axis
- axis 1 means the operation should "move across" the column axis
In [65]:
# 'index' is an alias for axis 0
drinks.mean(axis='index')
Out[65]:
In [66]:
# 'columns' is an alias for axis 1
drinks.mean(axis='columns').head()
Out[66]:
In [67]:
# read a dataset of Chipotle orders into a DataFrame
orders = pd.read_table('data/chipotle.tsv')
orders.head()
Out[67]:
In [68]:
# normal way to access string methods in Python
'hello'.upper()
Out[68]:
In [69]:
# string methods for pandas Series are accessed via 'str'
orders.item_name.str.upper().head()
Out[69]:
In [70]:
# string method 'contains' checks for a substring and returns a boolean Series
orders.item_name.str.contains('Chicken').head()
Out[70]:
In [71]:
# use the boolean Series to filter the DataFrame
orders[orders.item_name.str.contains('Chicken')].head()
Out[71]:
In [72]:
# string methods can be chained together
orders.choice_description.str.replace('[', '').str.replace(']', '').head()
Out[72]:
In [73]:
# many pandas string methods support regular expressions (regex)
orders.choice_description.str.replace(r'[\[\]]', '').head()
Out[73]:
String handling section of the pandas API reference
In [74]:
# read a dataset of alcohol consumption into a DataFrame
drinks = pd.read_csv('data/drinks.csv')
drinks.head()
Out[74]:
In [75]:
# examine the data type of each Series
drinks.dtypes
Out[75]:
In [76]:
# change the data type of an existing Series
drinks['beer_servings'] = drinks.beer_servings.astype(float)
drinks.dtypes
Out[76]:
Documentation for astype
In [77]:
# alternatively, change the data type of a Series while reading in a file
drinks = pd.read_csv('data/drinks.csv', dtype={'beer_servings':float})
drinks.dtypes
Out[77]:
In [78]:
# read a dataset of Chipotle orders into a DataFrame
orders = pd.read_table('data/chipotle.tsv')
orders.head()
Out[78]:
In [79]:
# examine the data type of each Series
orders.dtypes
Out[79]:
In [80]:
# convert a string to a number in order to do math
orders.item_price.str.replace('$', '').astype(float).mean()
Out[80]:
In [81]:
# string method 'contains' checks for a substring and returns a boolean Series
orders.item_name.str.contains('Chicken').head()
Out[81]:
In [82]:
# convert a boolean Series to an integer (False = 0, True = 1)
orders.item_name.str.contains('Chicken').astype(int).head()
Out[82]:
In [83]:
# read a dataset of alcohol consumption into a DataFrame
drinks = pd.read_csv('data/drinks.csv')
drinks.head()
Out[83]:
In [84]:
# calculate the mean beer servings across the entire dataset
drinks.beer_servings.mean()
Out[84]:
In [85]:
# calculate the mean beer servings just for countries in Africa
drinks[drinks.continent=='Africa'].beer_servings.mean()
Out[85]:
In [86]:
# calculate the mean beer servings for each continent
drinks.groupby('continent').beer_servings.mean()
Out[86]:
Documentation for groupby
In [87]:
# other aggregation functions (such as 'max') can also be used with groupby
drinks.groupby('continent').beer_servings.max()
Out[87]:
In [88]:
# multiple aggregation functions can be applied simultaneously
drinks.groupby('continent').beer_servings.agg(['count', 'mean', 'min', 'max'])
Out[88]:
Documentation for agg
In [89]:
# specifying a column to which the aggregation function should be applied is not required
drinks.groupby('continent').mean()
Out[89]:
In [90]:
# allow plots to appear in the notebook
%matplotlib inline
In [91]:
# side-by-side bar plot of the DataFrame directly above
drinks.groupby('continent').mean().plot(kind='bar')
Out[91]:
Documentation for plot
In [92]:
# read a dataset of top-rated IMDb movies into a DataFrame
movies = pd.read_csv('data/imdb_1000.csv')
movies.head()
Out[92]:
In [93]:
# examine the data type of each Series
movies.dtypes
Out[93]:
Exploring a non-numeric Series:
In [94]:
# count the non-null values, unique values, and frequency of the most common value
movies.genre.describe()
Out[94]:
Documentation for describe
In [95]:
# count how many times each value in the Series occurs
movies.genre.value_counts()
Out[95]:
Documentation for value_counts
In [96]:
# display percentages instead of raw counts
movies.genre.value_counts(normalize=True)
Out[96]:
In [97]:
# 'value_counts' (like many pandas methods) outputs a Series
type(movies.genre.value_counts())
Out[97]:
In [98]:
# thus, you can add another Series method on the end
movies.genre.value_counts().head()
Out[98]:
In [99]:
# display the unique values in the Series
movies.genre.unique()
Out[99]:
In [100]:
# count the number of unique values in the Series
movies.genre.nunique()
Out[100]:
In [101]:
# compute a cross-tabulation of two Series
pd.crosstab(movies.genre, movies.content_rating)
Out[101]:
Documentation for crosstab
Exploring a numeric Series:
In [102]:
# calculate various summary statistics
movies.duration.describe()
Out[102]:
In [103]:
# many statistics are implemented as Series methods
movies.duration.mean()
Out[103]:
Documentation for mean
In [104]:
# 'value_counts' is primarily useful for categorical data, not numerical data
movies.duration.value_counts().head()
Out[104]:
In [105]:
# allow plots to appear in the notebook
%matplotlib inline
In [106]:
# histogram of the 'duration' Series (shows the distribution of a numerical variable)
movies.duration.plot(kind='hist')
Out[106]:
In [107]:
# bar plot of the 'value_counts' for the 'genre' Series
movies.genre.value_counts().plot(kind='bar')
Out[107]:
Documentation for plot
In [108]:
# read a dataset of UFO reports into a DataFrame
ufo = pd.read_csv('data/ufo.csv')
ufo.tail()
Out[108]:
What does "NaN" mean?
numpy.nan
.read_csv
detects missing values (by default) when reading the file, and replaces them with this special value.Documentation for read_csv
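A related property of numpy.nan worth knowing (a quick sketch): NaN compares unequal to everything, including itself, which is why dedicated methods like isnull exist.
import numpy as np
np.nan == np.nan   # False: NaN is not equal to anything, even itself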
In [109]:
# 'isnull' returns a DataFrame of booleans (True if missing, False if not missing)
ufo.isnull().tail()
Out[109]:
In [110]:
# 'notnull' returns the opposite of 'isnull' (True if not missing, False if missing)
ufo.notnull().tail()
Out[110]:
In [111]:
# count the number of missing values in each Series
ufo.isnull().sum()
Out[111]:
This calculation works because:
- the sum method for a DataFrame operates on axis=0 by default (and thus produces column sums)
- in order to add boolean values, pandas converts True to 1 and False to 0
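A quick sketch of the second point:
# pandas converts booleans to integers when summing (True=1, False=0)
pd.Series([True, False, True]).sum()   # returns 2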
In [112]:
# use the 'isnull' Series method to filter the DataFrame rows
ufo[ufo.City.isnull()].head()
Out[112]:
How to handle missing values depends on the dataset as well as the nature of your analysis. Here are some options:
In [113]:
# examine the number of rows and columns
ufo.shape
Out[113]:
In [114]:
# if 'any' values are missing in a row, then drop that row
ufo.dropna(how='any').shape
Out[114]:
Documentation for dropna
In [115]:
# 'inplace' parameter for 'dropna' is False by default, thus rows were only dropped temporarily
ufo.shape
Out[115]:
In [116]:
# if 'all' values are missing in a row, then drop that row (none are dropped in this case)
ufo.dropna(how='all').shape
Out[116]:
In [117]:
# if 'any' values are missing in a row (considering only 'City' and 'Shape Reported'), then drop that row
ufo.dropna(subset=['City', 'Shape Reported'], how='any').shape
Out[117]:
In [118]:
# if 'all' values are missing in a row (considering only 'City' and 'Shape Reported'), then drop that row
ufo.dropna(subset=['City', 'Shape Reported'], how='all').shape
Out[118]:
In [119]:
# 'value_counts' does not include missing values by default
ufo['Shape Reported'].value_counts().head()
Out[119]:
In [120]:
# explicitly include missing values
ufo['Shape Reported'].value_counts(dropna=False).head()
Out[120]:
Documentation for value_counts
In [121]:
# fill in missing values with a specified value
ufo['Shape Reported'].fillna(value='VARIOUS', inplace=True)
Documentation for fillna
In [122]:
# confirm that the missing values were filled in
ufo['Shape Reported'].value_counts().head()
Out[122]:
In [123]:
# read a dataset of alcohol consumption into a DataFrame
drinks = pd.read_csv('data/drinks.csv')
drinks.head()
Out[123]:
In [124]:
# every DataFrame has an index (sometimes called the "row labels")
drinks.index
Out[124]:
In [125]:
# column names are also stored in a special "index" object
drinks.columns
Out[125]:
In [126]:
# neither the index nor the columns are included in the shape
drinks.shape
Out[126]:
In [127]:
# index and columns both default to integers if you don't define them
pd.read_table('data/u.user', header=None, sep='|').head()
Out[127]:
What is the index used for?
In [128]:
# identification: index remains with each row when filtering the DataFrame
drinks[drinks.continent=='South America']
Out[128]:
In [129]:
# selection: select a portion of the DataFrame using the index
drinks.loc[23, 'beer_servings']
Out[129]:
Documentation for loc
In [130]:
# set an existing column as the index
drinks.set_index('country', inplace=True)
drinks.head()
Out[130]:
Documentation for set_index
In [131]:
# 'country' is now the index
drinks.index
Out[131]:
In [132]:
# 'country' is no longer a column
drinks.columns
Out[132]:
In [133]:
# 'country' data is no longer part of the DataFrame contents
drinks.shape
Out[133]:
In [134]:
# country name can now be used for selection
drinks.loc['Brazil', 'beer_servings']
Out[134]:
In [135]:
# index name is optional
drinks.index.name = None
drinks.head()
Out[135]:
In [136]:
# restore the index name, and move the index back to a column
drinks.index.name = 'country'
drinks.reset_index(inplace=True)
drinks.head()
Out[136]:
Documentation for reset_index
In [137]:
# many DataFrame methods output a DataFrame
drinks.describe()
Out[137]:
In [138]:
# you can interact with any DataFrame using its index and columns
drinks.describe().loc['25%', 'beer_servings']
Out[138]:
In [139]:
# read a dataset of alcohol consumption into a DataFrame
drinks = pd.read_csv('data/drinks.csv')
drinks.head()
Out[139]:
In [140]:
# every DataFrame has an index
drinks.index
Out[140]:
In [141]:
# every Series also has an index (which carries over from the DataFrame)
drinks.continent.head()
Out[141]:
In [142]:
# set 'country' as the index
drinks.set_index('country', inplace=True)
Documentation for set_index
In [143]:
# Series index is on the left, values are on the right
drinks.continent.head()
Out[143]:
In [144]:
# another example of a Series (output from the 'value_counts' method)
drinks.continent.value_counts()
Out[144]:
Documentation for value_counts
In [145]:
# access the Series index
drinks.continent.value_counts().index
Out[145]:
In [146]:
# access the Series values
drinks.continent.value_counts().values
Out[146]:
In [147]:
# elements in a Series can be selected by index (using bracket notation)
drinks.continent.value_counts()['Africa']
Out[147]:
In [148]:
# any Series can be sorted by its values
drinks.continent.value_counts().sort_values()
Out[148]:
In [149]:
# any Series can also be sorted by its index
drinks.continent.value_counts().sort_index()
Out[149]:
Documentation for sort_values
and sort_index
What is the index used for?
In [150]:
# 'beer_servings' Series contains the average annual beer servings per person
drinks.beer_servings.head()
Out[150]:
In [151]:
# create a Series containing the population of two countries
people = pd.Series([3000000, 85000], index=['Albania', 'Andorra'], name='population')
people
Out[151]:
Documentation for Series
In [152]:
# calculate the total annual beer servings for each country
(drinks.beer_servings * people).head()
Out[152]:
In [153]:
# concatenate the 'drinks' DataFrame with the 'population' Series (aligns by the index)
pd.concat([drinks, people], axis=1).head()
Out[153]:
In [154]:
# read a dataset of UFO reports into a DataFrame
ufo = pd.read_csv('data/ufo.csv')
ufo.head(3)
Out[154]:
The loc method is used to select rows and columns by label. You can pass it:
- a single label
- a list of labels
- a slice of labels
- a boolean Series
- a colon (which indicates "all labels")
In [155]:
# row 0, all columns
ufo.loc[0, :]
Out[155]:
In [156]:
# rows 0 and 1 and 2, all columns
ufo.loc[[0, 1, 2], :]
Out[156]:
In [157]:
# rows 0 through 2 (inclusive), all columns
ufo.loc[0:2, :]
Out[157]:
In [158]:
# this implies "all columns", but explicitly stating "all columns" is better
ufo.loc[0:2]
Out[158]:
In [159]:
# rows 0 through 2 (inclusive), column 'City'
ufo.loc[0:2, 'City']
Out[159]:
In [160]:
# rows 0 through 2 (inclusive), columns 'City' and 'State'
ufo.loc[0:2, ['City', 'State']]
Out[160]:
In [161]:
# accomplish the same thing using double brackets - but using 'loc' is preferred since it's more explicit
ufo[['City', 'State']].head(3)
Out[161]:
In [162]:
# rows 0 through 2 (inclusive), columns 'City' through 'State' (inclusive)
ufo.loc[0:2, 'City':'State']
Out[162]:
In [163]:
# accomplish the same thing using 'head' and 'drop'
ufo.head(3).drop('Time', axis=1)
Out[163]:
In [164]:
# rows in which the 'City' is 'Oakland', column 'State'
ufo.loc[ufo.City=='Oakland', 'State']
Out[164]:
In [165]:
# accomplish the same thing using "chained indexing" - but using 'loc' is preferred since chained indexing can cause problems
ufo[ufo.City=='Oakland'].State
Out[165]:
The iloc method is used to select rows and columns by integer position. You can pass it:
- a single integer position
- a list of integer positions
- a slice of integer positions
- a colon (which indicates "all integer positions")
In [166]:
# rows in positions 0 and 1, columns in positions 0 and 3
ufo.iloc[[0, 1], [0, 3]]
Out[166]:
In [167]:
# rows in positions 0 through 2 (exclusive), columns in positions 0 through 4 (exclusive)
ufo.iloc[0:2, 0:4]
Out[167]:
In [168]:
# rows in positions 0 through 2 (exclusive), all columns
ufo.iloc[0:2, :]
Out[168]:
In [169]:
# accomplish the same thing - but using 'iloc' is preferred since it's more explicit
ufo[0:2]
Out[169]:
The ix method is used to select rows and columns by label or integer position, and should only be used when you need to mix label-based and integer-based selection in the same call.
In [170]:
# read a dataset of alcohol consumption into a DataFrame and set 'country' as the index
drinks = pd.read_csv('data/drinks.csv', index_col='country')
drinks.head()
Out[170]:
In [171]:
# row with label 'Albania', column in position 0
drinks.ix['Albania', 0]
Out[171]:
In [172]:
# row in position 1, column with label 'beer_servings'
drinks.ix[1, 'beer_servings']
Out[172]:
Rules for using numbers with ix:
- if the index is made up of strings, numbers are treated as integer positions, and thus slices are exclusive on the right
- if the index is made up of integers, numbers are treated as labels, and thus slices are inclusive
In [173]:
# rows 'Albania' through 'Andorra' (inclusive), columns in positions 0 through 2 (exclusive)
drinks.ix['Albania':'Andorra', 0:2]
Out[173]:
In [174]:
# rows 0 through 2 (inclusive), columns in positions 0 through 2 (exclusive)
ufo.ix[0:2, 0:2]
Out[174]:
In [175]:
# read a dataset of UFO reports into a DataFrame
ufo = pd.read_csv('data/ufo.csv')
ufo.head()
Out[175]:
In [176]:
ufo.shape
Out[176]:
In [177]:
# remove the 'City' column (doesn't affect the DataFrame since inplace=False)
ufo.drop('City', axis=1).head()
Out[177]:
In [178]:
# confirm that the 'City' column was not actually removed
ufo.head()
Out[178]:
In [179]:
# remove the 'City' column (does affect the DataFrame since inplace=True)
ufo.drop('City', axis=1, inplace=True)
In [180]:
# confirm that the 'City' column was actually removed
ufo.head()
Out[180]:
In [181]:
# drop a row if any value is missing from that row (doesn't affect the DataFrame since inplace=False)
ufo.dropna(how='any').shape
Out[181]:
In [182]:
# confirm that no rows were actually removed
ufo.shape
Out[182]:
In [183]:
# use an assignment statement instead of the 'inplace' parameter
ufo = ufo.set_index('Time')
ufo.tail()
Out[183]:
In [184]:
# fill missing values using "backward fill" strategy (doesn't affect the DataFrame since inplace=False)
ufo.fillna(method='bfill').tail()
Out[184]:
In [185]:
# compare with "forward fill" strategy (doesn't affect the DataFrame since inplace=False)
ufo.fillna(method='ffill').tail()
Out[185]:
In [186]:
# read a dataset of alcohol consumption into a DataFrame
drinks = pd.read_csv('data/drinks.csv')
drinks.head()
Out[186]:
In [187]:
# exact memory usage is unknown because object columns are references elsewhere
drinks.info()
In [188]:
# force pandas to calculate the true memory usage
drinks.info(memory_usage='deep')
In [189]:
# calculate the memory usage for each Series (in bytes)
drinks.memory_usage(deep=True)
Out[189]:
Documentation for info
and memory_usage
In [190]:
# use the 'category' data type (new in pandas 0.15) to store the 'continent' strings as integers
drinks['continent'] = drinks.continent.astype('category')
drinks.dtypes
Out[190]:
In [191]:
# 'continent' Series appears to be unchanged
drinks.continent.head()
Out[191]:
In [192]:
# strings are now encoded (0 means 'Africa', 1 means 'Asia', 2 means 'Europe', etc.)
drinks.continent.cat.codes.head()
Out[192]:
In [193]:
# memory usage has been drastically reduced
drinks.memory_usage(deep=True)
Out[193]:
In [194]:
# repeat this process for the 'country' Series
drinks['country'] = drinks.country.astype('category')
drinks.memory_usage(deep=True)
Out[194]:
In [195]:
# memory usage increased because we created 193 categories
drinks.country.cat.categories
Out[195]:
The category data type should only be used with a string Series that has a small number of possible values.
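A rough check before converting (a sketch, assuming the drinks dataset of 193 rows):
# compare the number of unique values to the length of the Series
drinks.continent.nunique()   # 6 unique values in 193 rows: a good candidate for 'category'
drinks.country.nunique()     # 193 unique values in 193 rows: a poor candidate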
In [196]:
# create a small DataFrame from a dictionary
df = pd.DataFrame({'ID':[100, 101, 102, 103], 'quality':['good', 'very good', 'good', 'excellent']})
df
Out[196]:
In [197]:
# sort the DataFrame by the 'quality' Series (alphabetical order)
df.sort_values('quality')
Out[197]:
In [198]:
# define a logical ordering for the categories
df['quality'] = df.quality.astype('category', categories=['good', 'very good', 'excellent'], ordered=True)
df.quality
Out[198]:
In [199]:
# sort the DataFrame by the 'quality' Series (logical order)
df.sort_values('quality')
Out[199]:
In [200]:
# comparison operators work with ordered categories
df.loc[df.quality > 'good', :]
Out[200]:
In [201]:
# read the training dataset from Kaggle's Titanic competition into a DataFrame
train = pd.read_csv('http://bit.ly/kaggletrain')
train.head()
Out[201]:
Goal: Predict passenger survival aboard the Titanic based on passenger attributes
In [202]:
# create a feature matrix 'X' by selecting two DataFrame columns
feature_cols = ['Pclass', 'Parch']
X = train.loc[:, feature_cols]
X.shape
Out[202]:
In [203]:
# create a response vector 'y' by selecting a Series
y = train.Survived
y.shape
Out[203]:
Note: There is no need to convert these pandas objects to NumPy arrays. scikit-learn will understand these objects as long as they are entirely numeric and have the proper shapes.
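A quick way to confirm those expectations (a sketch):
# scikit-learn expects a two-dimensional feature matrix and a one-dimensional response vector
X.ndim   # 2
y.ndim   # 1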
In [204]:
# fit a classification model to the training data
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()
logreg.fit(X, y)
Out[204]:
Video series: Introduction to machine learning with scikit-learn
In [205]:
# read the testing dataset from Kaggle's Titanic competition into a DataFrame
test = pd.read_csv('http://bit.ly/kaggletest')
test.head()
Out[205]:
In [206]:
# create a feature matrix from the testing data that matches the training data
X_new = test.loc[:, feature_cols]
X_new.shape
Out[206]:
In [207]:
# use the fitted model to make predictions for the testing set observations
new_pred_class = logreg.predict(X_new)
In [208]:
# create a DataFrame of passenger IDs and testing set predictions
pd.DataFrame({'PassengerId':test.PassengerId, 'Survived':new_pred_class}).head()
Out[208]:
Documentation for the DataFrame
constructor
In [209]:
# ensure that PassengerId is the first column by setting it as the index
pd.DataFrame({'PassengerId':test.PassengerId, 'Survived':new_pred_class}).set_index('PassengerId').head()
Out[209]:
In [210]:
# write the DataFrame to a CSV file that can be submitted to Kaggle
pd.DataFrame({'PassengerId':test.PassengerId, 'Survived':new_pred_class}).set_index('PassengerId').to_csv('sub.csv')
Documentation for to_csv
In [211]:
# save a DataFrame to disk ("pickle it")
train.to_pickle('train.pkl')
In [212]:
# read a pickled object from disk ("unpickle it")
pd.read_pickle('train.pkl').head()
Out[212]:
Documentation for to_pickle
and read_pickle
Question: Could you explain how to read the pandas documentation?
Question: What is the difference between ufo.isnull() and pd.isnull(ufo)?
In [213]:
# read a dataset of UFO reports into a DataFrame
ufo = pd.read_csv('http://bit.ly/uforeports')
ufo.head()
Out[213]:
In [214]:
# use 'isnull' as a top-level function
pd.isnull(ufo).head()
Out[214]:
In [215]:
# equivalent: use 'isnull' as a DataFrame method
ufo.isnull().head()
Out[215]:
Documentation for isnull
Question: Why are DataFrame slices inclusive when using .loc, but exclusive when using .iloc?
In [216]:
# label-based slicing is inclusive of the start and stop
ufo.loc[0:4, :]
Out[216]:
In [217]:
# position-based slicing is inclusive of the start and exclusive of the stop
ufo.iloc[0:4, :]
Out[217]:
In [218]:
# 'iloc' is simply following NumPy's slicing convention...
ufo.values[0:4, :]
Out[218]:
In [219]:
# ...and NumPy is simply following Python's slicing convention
'python'[0:4]
Out[219]:
In [220]:
# 'loc' is inclusive of the stopping label because you don't necessarily know what label will come after it
ufo.loc[0:4, 'City':'State']
Out[220]:
Question: How do I randomly sample rows from a DataFrame?
In [221]:
# sample 3 rows from the DataFrame without replacement (new in pandas 0.16.1)
ufo.sample(n=3)
Out[221]:
Documentation for sample
In [222]:
# use the 'random_state' parameter for reproducibility
ufo.sample(n=3, random_state=42)
Out[222]:
In [223]:
# sample 75% of the DataFrame's rows without replacement
train = ufo.sample(frac=0.75, random_state=99)
In [224]:
# store the remaining 25% of the rows in another DataFrame
test = ufo.loc[~ufo.index.isin(train.index), :]
Documentation for isin
In [225]:
# read the training dataset from Kaggle's Titanic competition
train = pd.read_csv('http://bit.ly/kaggletrain')
train.head()
Out[225]:
In [226]:
# create the 'Sex_male' dummy variable using the 'map' method
train['Sex_male'] = train.Sex.map({'female':0, 'male':1})
train.head()
Out[226]:
Documentation for map
In [227]:
# alternative: use 'get_dummies' to create one column for every possible value
pd.get_dummies(train.Sex).head()
Out[227]:
Generally speaking: if a categorical feature has 'k' possible values, you only need 'k-1' dummy variables to capture all of the information, which is why the first dummy variable is often dropped (as shown below).
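A sanity check of that claim for the 'Sex' feature (a sketch; it relies on the 'Sex_male' column created above):
# the dropped 'female' dummy is always the complement of 'Sex_male'
(pd.get_dummies(train.Sex)['female'] == 1 - train.Sex_male).all()   # True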
In [228]:
# drop the first dummy variable ('female') using the 'iloc' method
pd.get_dummies(train.Sex).iloc[:, 1:].head()
Out[228]:
In [229]:
# add a prefix to identify the source of the dummy variables
pd.get_dummies(train.Sex, prefix='Sex').iloc[:, 1:].head()
Out[229]:
In [230]:
# use 'get_dummies' with a feature that has 3 possible values
pd.get_dummies(train.Embarked, prefix='Embarked').head(10)
Out[230]:
In [231]:
# drop the first dummy variable ('C')
pd.get_dummies(train.Embarked, prefix='Embarked').iloc[:, 1:].head(10)
Out[231]:
How to translate these values back to the original 'Embarked' value:
- Embarked_Q=0 and Embarked_S=0 means C
- Embarked_Q=1 and Embarked_S=0 means Q
- Embarked_Q=0 and Embarked_S=1 means S
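One programmatic way to invert the full set of dummies (a sketch; idxmax picks the column holding the 1 in each row, and rows where 'Embarked' was missing fall back to the first column):
# recover the original value from the complete set of dummy columns
dummies = pd.get_dummies(train.Embarked, prefix='Embarked')
dummies.idxmax(axis=1).str.replace('Embarked_', '').head(10)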
In [232]:
# save the DataFrame of dummy variables and concatenate them to the original DataFrame
embarked_dummies = pd.get_dummies(train.Embarked, prefix='Embarked').iloc[:, 1:]
train = pd.concat([train, embarked_dummies], axis=1)
train.head()
Out[232]:
Documentation for concat
In [233]:
# reset the DataFrame
train = pd.read_csv('http://bit.ly/kaggletrain')
train.head()
Out[233]:
In [234]:
# pass the DataFrame to 'get_dummies' and specify which columns to dummy (it drops the original columns)
pd.get_dummies(train, columns=['Sex', 'Embarked']).head()
Out[234]:
In [235]:
# use the 'drop_first' parameter (new in pandas 0.18) to drop the first dummy variable for each feature
pd.get_dummies(train, columns=['Sex', 'Embarked'], drop_first=True).head()
Out[235]:
Documentation for get_dummies
In [236]:
# read a dataset of UFO reports into a DataFrame
ufo = pd.read_csv('http://bit.ly/uforeports')
ufo.head()
Out[236]:
In [237]:
# 'Time' is currently stored as a string
ufo.dtypes
Out[237]:
In [238]:
# hour could be accessed using string slicing, but this approach breaks too easily
ufo.Time.str.slice(-5, -3).astype(int).head()
Out[238]:
In [239]:
# convert 'Time' to datetime format
ufo['Time'] = pd.to_datetime(ufo.Time)
ufo.head()
Out[239]:
In [240]:
ufo.dtypes
Out[240]:
Documentation for to_datetime
In [241]:
# convenient Series attributes are now available
ufo.Time.dt.hour.head()
Out[241]:
In [242]:
ufo.Time.dt.weekday_name.head()
Out[242]:
In [243]:
ufo.Time.dt.dayofyear.head()
Out[243]:
API reference for datetime properties and methods
In [244]:
# convert a single string to datetime format (outputs a timestamp object)
ts = pd.to_datetime('1/1/1999')
ts
Out[244]:
In [245]:
# compare a datetime Series with a timestamp
ufo.loc[ufo.Time >= ts, :].head()
Out[245]:
In [246]:
# perform mathematical operations with timestamps (outputs a timedelta object)
ufo.Time.max() - ufo.Time.min()
Out[246]:
In [247]:
# timedelta objects also have attributes you can access
(ufo.Time.max() - ufo.Time.min()).days
Out[247]:
In [248]:
# allow plots to appear in the notebook
%matplotlib inline
In [249]:
# count the number of UFO reports per year
ufo['Year'] = ufo.Time.dt.year
ufo.Year.value_counts().sort_index().head()
Out[249]:
In [250]:
# plot the number of UFO reports per year (line plot is the default)
ufo.Year.value_counts().sort_index().plot()
Out[250]:
In [251]:
# read a dataset of movie reviewers into a DataFrame
user_cols = ['user_id', 'age', 'gender', 'occupation', 'zip_code']
users = pd.read_table('http://bit.ly/movieusers', sep='|', header=None, names=user_cols, index_col='user_id')
users.head()
Out[251]:
In [252]:
users.shape
Out[252]:
In [253]:
# detect duplicate zip codes: True if an item is identical to a previous item
users.zip_code.duplicated().tail()
Out[253]:
In [254]:
# count the duplicate items (True becomes 1, False becomes 0)
users.zip_code.duplicated().sum()
Out[254]:
In [255]:
# detect duplicate DataFrame rows: True if an entire row is identical to a previous row
users.duplicated().tail()
Out[255]:
In [256]:
# count the duplicate rows
users.duplicated().sum()
Out[256]:
Logic for duplicated:
- keep='first' (default): mark duplicates as True, except for the first occurrence
- keep='last': mark duplicates as True, except for the last occurrence
- keep=False: mark all duplicates as True
In [257]:
# examine the duplicate rows (ignoring the first occurrence)
users.loc[users.duplicated(keep='first'), :]
Out[257]:
In [258]:
# examine the duplicate rows (ignoring the last occurrence)
users.loc[users.duplicated(keep='last'), :]
Out[258]:
In [259]:
# examine the duplicate rows (including all duplicates)
users.loc[users.duplicated(keep=False), :]
Out[259]:
In [260]:
# drop the duplicate rows (inplace=False by default)
users.drop_duplicates(keep='first').shape
Out[260]:
In [261]:
users.drop_duplicates(keep='last').shape
Out[261]:
In [262]:
users.drop_duplicates(keep=False).shape
Out[262]:
Documentation for drop_duplicates
In [263]:
# only consider a subset of columns when identifying duplicates
users.duplicated(subset=['age', 'zip_code']).sum()
Out[263]:
In [264]:
users.drop_duplicates(subset=['age', 'zip_code']).shape
Out[264]:
In [265]:
# read a dataset of top-rated IMDb movies into a DataFrame
movies = pd.read_csv('http://bit.ly/imdbratings')
movies.head()
Out[265]:
In [266]:
# count the missing values in the 'content_rating' Series
movies.content_rating.isnull().sum()
Out[266]:
In [267]:
# examine the DataFrame rows that contain those missing values
movies[movies.content_rating.isnull()]
Out[267]:
In [268]:
# examine the unique values in the 'content_rating' Series
movies.content_rating.value_counts()
Out[268]:
Goal: Mark the 'NOT RATED' values as missing values, represented by 'NaN'.
In [269]:
# first, locate the relevant rows
movies[movies.content_rating=='NOT RATED'].head()
Out[269]:
In [270]:
# then, select the 'content_rating' Series from those rows
movies[movies.content_rating=='NOT RATED'].content_rating.head()
Out[270]:
In [271]:
# finally, replace the 'NOT RATED' values with 'NaN' (imported from NumPy)
import numpy as np
movies[movies.content_rating=='NOT RATED'].content_rating = np.nan
Problem: That statement involves two operations, a __getitem__ and a __setitem__. pandas can't guarantee whether the __getitem__ operation returns a view or a copy of the data.
- If __getitem__ returns a view of the data, __setitem__ will affect the 'movies' DataFrame.
- If __getitem__ returns a copy of the data, __setitem__ will not affect the 'movies' DataFrame.
In [272]:
# the 'content_rating' Series has not changed
movies.content_rating.isnull().sum()
Out[272]:
Solution: Use the loc method, which replaces the 'NOT RATED' values in a single __setitem__ operation.
In [273]:
# replace the 'NOT RATED' values with 'NaN' (does not cause a SettingWithCopyWarning)
movies.loc[movies.content_rating=='NOT RATED', 'content_rating'] = np.nan
In [274]:
# this time, the 'content_rating' Series has changed
movies.content_rating.isnull().sum()
Out[274]:
Summary: Use the loc method any time you are selecting rows and columns in the same statement.
More information: Modern Pandas (Part 1)
In [275]:
# create a DataFrame only containing movies with a high 'star_rating'
top_movies = movies.loc[movies.star_rating >= 9, :]
top_movies
Out[275]:
Goal: Fix the 'duration' for 'The Shawshank Redemption'.
In [276]:
# overwrite the relevant cell with the correct duration
top_movies.loc[0, 'duration'] = 150
Problem: pandas isn't sure whether 'top_movies' is a view or a copy of 'movies'.
In [277]:
# 'top_movies' DataFrame has been updated
top_movies
Out[277]:
In [278]:
# 'movies' DataFrame has not been updated
movies.head(1)
Out[278]:
Solution: Any time you are attempting to create a DataFrame copy, use the copy method.
In [279]:
# explicitly create a copy of 'movies'
top_movies = movies.loc[movies.star_rating >= 9, :].copy()
In [280]:
# pandas now knows that you are updating a copy instead of a view (does not cause a SettingWithCopyWarning)
top_movies.loc[0, 'duration'] = 150
In [281]:
# 'top_movies' DataFrame has been updated
top_movies
Out[281]:
Documentation on indexing and selection: Returning a view versus a copy
Stack Overflow: What is the point of views in pandas if it is undefined whether an indexing operation returns a view or a copy?