In [1]:
    
# conventional way to import pandas
import pandas as pd
# check the pandas version
print('pandas version:', pd.__version__)
    
    
In [2]:
    
# read a dataset of Chipotle orders into a DataFrame
orders = pd.read_table('data/chipotle.tsv')
    
In [3]:
    
# examine the first 5 rows
orders.head()
    
    Out[3]:
Documentation for read_table
In [4]:
    
users = pd.read_table('data/u.user')
# examine the first 5 rows
users.head()
    
    Out[4]:
In [5]:
    
users = pd.read_table('data/u.user', sep='|')
# examine the first 5 rows
users.head()
    
    Out[5]:
In [6]:
    
users = pd.read_table('data/u.user', sep='|', header=None)
# examine the first 5 rows
users.head()
    
    Out[6]:
In [7]:
    
user_cols = ['user_id', 'age', 'gender', 'occupation', 'zip_code']
users = pd.read_table('data/u.user', sep='|', header=None, names=user_cols)
# examine the first 5 rows
users.head()
    
    Out[7]:
In [7]:
    
# read a dataset of UFO reports into a DataFrame
ufo = pd.read_table('data/ufo.csv', sep=',')
    
In [8]:
    
# read_csv is equivalent to read_table, except it assumes a comma separator
ufo = pd.read_csv('data/ufo.csv')
type(ufo)
    
    Out[8]:
In [9]:
    
# examine the first 5 rows
ufo.head()
    
    Out[9]:
In [10]:
    
# select the 'City' Series using bracket notation
ufo['City']
    
    Out[10]:
In [16]:
    
type(ufo['City'])
    
    Out[16]:
In [17]:
    
# or equivalently, use dot notation - see notes below
ufo.City
    
    Out[17]:
Bracket notation will always work, whereas dot notation has limitations:
- Dot notation doesn't work if there are spaces in the Series name (such as 'Colors Reported').
- Dot notation doesn't work if the Series has the same name as a DataFrame method or attribute (like 'head' or 'shape').
- Dot notation can't be used to define the name of a new Series (see below).
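For example, a quick sketch of the first two limitations (using the ufo DataFrame above; the 'shape' column is hypothetical):

# a space in the column name rules out dot notation
ufo['Colors Reported']    # works
# ufo.Colors Reported     # SyntaxError

# a column name that collides with a DataFrame attribute is shadowed by the attribute:
# if ufo had a (hypothetical) column named 'shape', ufo.shape would still return the dimensions
# ufo['shape']            # bracket notation would select the column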
In [10]:
    
# create a new 'Location' Series (must use bracket notation to define the Series name)
ufo['Location'] = ufo.City + ', ' + ufo.State
ufo.head()
    
    Out[10]:
In [11]:
    
# read a dataset of top-rated IMDb movies into a DataFrame
movies = pd.read_csv('data/imdb_1000.csv')
    
Methods end with parentheses, while attributes don't:
In [12]:
    
# example method: show the first 5 rows
movies.head()
    
    Out[12]:
In [13]:
    
# example method: calculate summary statistics
movies.describe()
    
    Out[13]:
In [14]:
    
# example attribute: number of rows and columns
movies.shape
    
    Out[14]:
In [15]:
    
# example attribute: data type of each column
movies.dtypes
    
    Out[15]:
In [16]:
    
# use an optional parameter to the describe method to summarize only 'object' columns
movies.describe(include=['object'])
    
    Out[16]:
Documentation for describe
In [11]:
    
# read a dataset of UFO reports into a DataFrame
ufo = pd.read_csv('data/ufo.csv')
    
In [12]:
    
# examine the column names
ufo.columns
    
    Out[12]:
In [13]:
    
# rename two of the columns by using the 'rename' method
ufo.rename(columns={'Colors Reported':'Colors_Reported', 'Shape Reported':'Shape_Reported'}, inplace=True)
ufo.columns
    
    Out[13]:
Documentation for rename
In [22]:
    
# replace all of the column names by overwriting the 'columns' attribute
ufo_cols = ['city', 'colors reported', 'shape reported', 'state', 'time']
ufo.columns = ufo_cols
ufo.columns
    
    Out[22]:
In [23]:
    
# replace the column names during the file reading process by using the 'names' parameter
ufo = pd.read_csv('data/ufo.csv', header=0, names=ufo_cols)
ufo.head()
    
    Out[23]:
Documentation for read_csv
In [24]:
    
# replace all spaces with underscores in the column names by using the 'str.replace' method
ufo.columns = ufo.columns.str.replace(' ', '_')
ufo.columns
    
    Out[24]:
Documentation for str.replace
In [25]:
    
# read a dataset of UFO reports into a DataFrame
ufo = pd.read_csv('data/ufo.csv')
ufo.head()
    
    Out[25]:
In [24]:
    
# remove a single column (axis=1 refers to columns)
ufo.drop('Colors Reported', axis=1, inplace=True)
ufo.head()
    
    Out[24]:
Documentation for drop
In [25]:
    
# remove multiple columns at once
ufo.drop(['City', 'State'], axis=1, inplace=True)
ufo.head()
    
    Out[25]:
In [26]:
    
# remove multiple rows at once (axis=0 refers to rows)
ufo.drop([0, 1], axis=0, inplace=True)
ufo.head()
    
    Out[26]:
In [28]:
    
# read a dataset of top-rated IMDb movies into a DataFrame
movies = pd.read_csv('data/imdb_1000.csv')
movies.head()
    
    Out[28]:
Note: None of the sorting methods below affect the underlying data. (In other words, the sorting is temporary).
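To keep a sorted order, either reassign the result or pass inplace=True (a minimal sketch):

# persist the sort by reassigning the result...
movies = movies.sort_values('title')
# ...or by sorting in place
movies.sort_values('title', inplace=True)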
In [29]:
    
# sort the 'title' Series in ascending order (returns a Series)
movies.title.sort_values()
    
    Out[29]:
In [29]:
    
# sort in descending order instead
movies.title.sort_values(ascending=False).head()
    
    Out[29]:
Documentation for sort_values for a Series. (Prior to version 0.17, use order instead.)
In [31]:
    
# sort the entire DataFrame by the 'title' Series (returns a DataFrame)
movies.sort_values('title').head()
    
    Out[31]:
In [33]:
    
# sort in descending order instead
movies.sort_values('title', ascending=False).head()
    
    Out[33]:
Documentation for sort_values for a DataFrame. (Prior to version 0.17, use sort instead.)
In [32]:
    
# sort the DataFrame first by 'content_rating', then by 'duration'
movies.sort_values(['content_rating', 'duration']).head()
    
    Out[32]:
Summary of changes to the sorting API in pandas 0.17
In [35]:
    
# read a dataset of top-rated IMDb movies into a DataFrame
movies = pd.read_csv('data/imdb_1000.csv')
movies.head()
    
    Out[35]:
In [34]:
    
# examine the number of rows and columns
movies.shape
    
    Out[34]:
Goal: Filter the DataFrame rows to only show movies with a 'duration' of at least 200 minutes.
In [35]:
    
# create a list in which each element refers to a DataFrame row: True if the row satisfies the condition, False otherwise
booleans = []
for length in movies.duration:
    if length >= 200:
        booleans.append(True)
    else:
        booleans.append(False)
    
In [36]:
    
# confirm that the list has the same length as the DataFrame
len(booleans)
    
    Out[36]:
In [37]:
    
# examine the first five list elements
booleans[0:5]
    
    Out[37]:
In [38]:
    
# convert the list to a Series
is_long = pd.Series(booleans)
is_long.head()
    
    Out[38]:
In [39]:
    
# use bracket notation with the boolean Series to tell the DataFrame which rows to display
movies[is_long]
    
    Out[39]:
In [40]:
    
# simplify the steps above: no need to write a for loop to create 'is_long' since pandas will broadcast the comparison
is_long = movies.duration >= 200
movies[is_long]
# or equivalently, write it in one line (no need to create the 'is_long' object)
movies[movies.duration >= 200]
    
    Out[40]:
In [41]:
    
# select the 'genre' Series from the filtered DataFrame
is_long = movies.duration >= 200
movies[is_long].genre
# or equivalently, use the 'loc' method
movies.loc[movies.duration >= 200, 'genre']
    
    Out[41]:
Documentation for loc
In [36]:
    
# read a dataset of top-rated IMDb movies into a DataFrame
movies = pd.read_csv('data/imdb_1000.csv')
movies.head()
    
    Out[36]:
In [37]:
    
# filter the DataFrame to only show movies with a 'duration' of at least 200 minutes
movies[movies.duration >= 200]
    
    Out[37]:
Understanding logical operators:
- and: True only if both sides of the operator are True
- or: True if either side of the operator is True
In [44]:
    
# demonstration of the 'and' operator
print(True and True)
print(True and False)
print(False and False)
    
    
In [45]:
    
# demonstration of the 'or' operator
print(True or True)
print(True or False)
print(False or False)
    
    
Rules for specifying multiple filter criteria in pandas:
- use & instead of and
- use | instead of or
- wrap each condition in parentheses (the comparison operators have lower precedence than & and |)

Goal: Further filter the DataFrame of long movies (duration >= 200) to only show movies which also have a 'genre' of 'Drama'
In [46]:
    
# CORRECT: use the '&' operator to specify that both conditions are required
movies[(movies.duration >=200) & (movies.genre == 'Drama')]
    
    Out[46]:
In [47]:
    
# INCORRECT: using the '|' operator would have shown movies that are either long or dramas (or both)
movies[(movies.duration >=200) | (movies.genre == 'Drama')].head()
    
    Out[47]:
Goal: Filter the original DataFrame to show movies with a 'genre' of 'Crime' or 'Drama' or 'Action'
In [44]:
    
# use the '|' operator to specify that a row can match any of the three criteria
movies[(movies.genre == 'Crime') | (movies.genre == 'Drama') | (movies.genre == 'Action')].tail(20)
# or equivalently, use the 'isin' method
#movies[movies.genre.isin(['Crime', 'Drama', 'Action'])].head(10)
    
    Out[44]:
Documentation for isin
Question: When reading from a file, how do I read in only a subset of the columns?
In [49]:
    
# read a dataset of UFO reports into a DataFrame, and check the columns
ufo = pd.read_csv('data/ufo.csv')
ufo.columns
    
    Out[49]:
In [50]:
    
# specify which columns to include by name
ufo = pd.read_csv('data/ufo.csv', usecols=['City', 'State'])
# or equivalently, specify columns by position
ufo = pd.read_csv('data/ufo.csv', usecols=[0, 4])
ufo.columns
    
    Out[50]:
Question: When reading from a file, how do I read in only a subset of the rows?
In [51]:
    
# specify how many rows to read
ufo = pd.read_csv('data/ufo.csv', nrows=3)
ufo
    
    Out[51]:
Documentation for read_csv
Question: How do I iterate through a Series?
In [52]:
    
# Series are directly iterable (like a list)
for c in ufo.City:
    print(c)
    
    
Question: How do I iterate through a DataFrame?
In [53]:
    
# various methods are available to iterate through a DataFrame
for index, row in ufo.iterrows():
    print(index, row.City, row.State)
    
    
Documentation for iterrows
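iterrows is not the only option; itertuples is generally faster. A minimal sketch, assuming the same ufo columns:

# itertuples yields namedtuples (row.Index holds the index value)
for row in ufo.itertuples():
    print(row.Index, row.City, row.State)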
Question: How do I drop all non-numeric columns from a DataFrame?
In [45]:
    
# read a dataset of alcohol consumption into a DataFrame, and check the data types
drinks = pd.read_csv('data/drinks.csv')
drinks.dtypes
    
    Out[45]:
In [55]:
    
# only include numeric columns in the DataFrame
import numpy as np
drinks.select_dtypes(include=[np.number]).dtypes
    
    Out[55]:
Documentation for select_dtypes
Question: How do I know whether I should pass an argument as a string or a list?
In [56]:
    
# describe all of the numeric columns
drinks.describe()
    
    Out[56]:
In [57]:
    
# pass the string 'all' to describe all columns
drinks.describe(include='all')
    
    Out[57]:
In [58]:
    
# pass a list of data types to only describe certain types
drinks.describe(include=['object', 'float64'])
    
    Out[58]:
In [59]:
    
# pass a list even if you only want to describe a single data type
drinks.describe(include=['object'])
    
    Out[59]:
Documentation for describe
In [60]:
    
# read a dataset of alcohol consumption into a DataFrame
drinks = pd.read_csv('data/drinks.csv')
drinks.head()
    
    Out[60]:
In [61]:
    
# drop a column (temporarily)
drinks.drop('continent', axis=1).head()
    
    Out[61]:
Documentation for drop
In [62]:
    
# drop a row (temporarily)
drinks.drop(2, axis=0).head()
    
    Out[62]:
When referring to rows or columns with the axis parameter:
- axis 0 refers to rows
- axis 1 refers to columns
In [46]:
    
# calculate the mean of each numeric column
drinks.mean()
# or equivalently, specify the axis explicitly
drinks.mean(axis=0)
    
    Out[46]:
Documentation for mean
In [64]:
    
# calculate the mean of each row
drinks.mean(axis=1).head()
    
    Out[64]:
When performing a mathematical operation with the axis parameter:
- axis 0 means the operation should "move down" the row axis, producing one result per column
- axis 1 means the operation should "move across" the column axis, producing one result per row
In [65]:
    
# 'index' is an alias for axis 0
drinks.mean(axis='index')
    
    Out[65]:
In [66]:
    
# 'columns' is an alias for axis 1
drinks.mean(axis='columns').head()
    
    Out[66]:
In [67]:
    
# read a dataset of Chipotle orders into a DataFrame
orders = pd.read_table('data/chipotle.tsv')
orders.head()
    
    Out[67]:
In [68]:
    
# normal way to access string methods in Python
'hello'.upper()
    
    Out[68]:
In [69]:
    
# string methods for pandas Series are accessed via 'str'
orders.item_name.str.upper().head()
    
    Out[69]:
In [70]:
    
# string method 'contains' checks for a substring and returns a boolean Series
orders.item_name.str.contains('Chicken').head()
    
    Out[70]:
In [71]:
    
# use the boolean Series to filter the DataFrame
orders[orders.item_name.str.contains('Chicken')].head()
    
    Out[71]:
In [72]:
    
# string methods can be chained together
orders.choice_description.str.replace('[', '').str.replace(']', '').head()
    
    Out[72]:
In [73]:
    
# many pandas string methods support regular expressions (regex)
orders.choice_description.str.replace(r'[\[\]]', '').head()
    
    Out[73]:
String handling section of the pandas API reference
In [74]:
    
# read a dataset of alcohol consumption into a DataFrame
drinks = pd.read_csv('data/drinks.csv')
drinks.head()
    
    Out[74]:
In [75]:
    
# examine the data type of each Series
drinks.dtypes
    
    Out[75]:
In [76]:
    
# change the data type of an existing Series
drinks['beer_servings'] = drinks.beer_servings.astype(float)
drinks.dtypes
    
    Out[76]:
Documentation for astype
In [77]:
    
# alternatively, change the data type of a Series while reading in a file
drinks = pd.read_csv('data/drinks.csv', dtype={'beer_servings':float})
drinks.dtypes
    
    Out[77]:
In [78]:
    
# read a dataset of Chipotle orders into a DataFrame
orders = pd.read_table('data/chipotle.tsv')
orders.head()
    
    Out[78]:
In [79]:
    
# examine the data type of each Series
orders.dtypes
    
    Out[79]:
In [80]:
    
# convert a string to a number in order to do math
orders.item_price.str.replace('$', '').astype(float).mean()
    
    Out[80]:
In [81]:
    
# string method 'contains' checks for a substring and returns a boolean Series
orders.item_name.str.contains('Chicken').head()
    
    Out[81]:
In [82]:
    
# convert a boolean Series to an integer (False = 0, True = 1)
orders.item_name.str.contains('Chicken').astype(int).head()
    
    Out[82]:
In [83]:
    
# read a dataset of alcohol consumption into a DataFrame
drinks = pd.read_csv('data/drinks.csv')
drinks.head()
    
    Out[83]:
In [84]:
    
# calculate the mean beer servings across the entire dataset
drinks.beer_servings.mean()
    
    Out[84]:
In [85]:
    
# calculate the mean beer servings just for countries in Africa
drinks[drinks.continent=='Africa'].beer_servings.mean()
    
    Out[85]:
In [86]:
    
# calculate the mean beer servings for each continent
drinks.groupby('continent').beer_servings.mean()
    
    Out[86]:
Documentation for groupby
In [87]:
    
# other aggregation functions (such as 'max') can also be used with groupby
drinks.groupby('continent').beer_servings.max()
    
    Out[87]:
In [88]:
    
# multiple aggregation functions can be applied simultaneously
drinks.groupby('continent').beer_servings.agg(['count', 'mean', 'min', 'max'])
    
    Out[88]:
Documentation for agg
In [89]:
    
# specifying a column to which the aggregation function should be applied is not required
drinks.groupby('continent').mean()
    
    Out[89]:
In [90]:
    
# allow plots to appear in the notebook
%matplotlib inline
    
In [91]:
    
# side-by-side bar plot of the DataFrame directly above
drinks.groupby('continent').mean().plot(kind='bar')
    
    Out[91]:
    
Documentation for plot
In [92]:
    
# read a dataset of top-rated IMDb movies into a DataFrame
movies = pd.read_csv('data/imdb_1000.csv')
movies.head()
    
    Out[92]:
In [93]:
    
# examine the data type of each Series
movies.dtypes
    
    Out[93]:
Exploring a non-numeric Series:
In [94]:
    
# count the non-null values, unique values, and frequency of the most common value
movies.genre.describe()
    
    Out[94]:
Documentation for describe
In [95]:
    
# count how many times each value in the Series occurs
movies.genre.value_counts()
    
    Out[95]:
Documentation for value_counts
In [96]:
    
# display percentages instead of raw counts
movies.genre.value_counts(normalize=True)
    
    Out[96]:
In [97]:
    
# 'value_counts' (like many pandas methods) outputs a Series
type(movies.genre.value_counts())
    
    Out[97]:
In [98]:
    
# thus, you can add another Series method on the end
movies.genre.value_counts().head()
    
    Out[98]:
In [99]:
    
# display the unique values in the Series
movies.genre.unique()
    
    Out[99]:
In [100]:
    
# count the number of unique values in the Series
movies.genre.nunique()
    
    Out[100]:
In [101]:
    
# compute a cross-tabulation of two Series
pd.crosstab(movies.genre, movies.content_rating)
    
    Out[101]:
Documentation for crosstab
Exploring a numeric Series:
In [102]:
    
# calculate various summary statistics
movies.duration.describe()
    
    Out[102]:
In [103]:
    
# many statistics are implemented as Series methods
movies.duration.mean()
    
    Out[103]:
Documentation for mean
In [104]:
    
# 'value_counts' is primarily useful for categorical data, not numerical data
movies.duration.value_counts().head()
    
    Out[104]:
In [105]:
    
# allow plots to appear in the notebook
%matplotlib inline
    
In [106]:
    
# histogram of the 'duration' Series (shows the distribution of a numerical variable)
movies.duration.plot(kind='hist')
    
    Out[106]:
    
In [107]:
    
# bar plot of the 'value_counts' for the 'genre' Series
movies.genre.value_counts().plot(kind='bar')
    
    Out[107]:
    
Documentation for plot
In [108]:
    
# read a dataset of UFO reports into a DataFrame
ufo = pd.read_csv('data/ufo.csv')
ufo.tail()
    
    Out[108]:
What does "NaN" mean?
- "NaN" is not a string, rather it's a special value: numpy.nan.
- It stands for "Not a Number" and indicates a missing value.
- read_csv detects missing values (by default) when reading the file, and replaces them with this special value.

Documentation for read_csv
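A minimal sketch of why NaN needs special handling:

import numpy as np
np.nan == np.nan    # False: NaN is not equal to anything, including itself
pd.isnull(np.nan)   # True: use isnull (or notnull) to detect missing values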
In [109]:
    
# 'isnull' returns a DataFrame of booleans (True if missing, False if not missing)
ufo.isnull().tail()
    
    Out[109]:
In [110]:
    
# 'notnull' returns the opposite of 'isnull' (True if not missing, False if missing)
ufo.notnull().tail()
    
    Out[110]:
In [111]:
    
# count the number of missing values in each Series
ufo.isnull().sum()
    
    Out[111]:
This calculation works because:
1. The sum method for a DataFrame operates on axis=0 by default (and thus produces column sums).
2. In order to add boolean values, pandas converts True to 1 and False to 0.
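A one-line illustration of that conversion:

pd.Series([True, False, True]).sum()    # returns 2, since True counts as 1 and False as 0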
In [112]:
    
# use the 'isnull' Series method to filter the DataFrame rows
ufo[ufo.City.isnull()].head()
    
    Out[112]:
How to handle missing values depends on the dataset as well as the nature of your analysis. Here are some options:
In [113]:
    
# examine the number of rows and columns
ufo.shape
    
    Out[113]:
In [114]:
    
# if 'any' values are missing in a row, then drop that row
ufo.dropna(how='any').shape
    
    Out[114]:
Documentation for dropna
In [115]:
    
# 'inplace' parameter for 'dropna' is False by default, thus rows were only dropped temporarily
ufo.shape
    
    Out[115]:
In [116]:
    
# if 'all' values are missing in a row, then drop that row (none are dropped in this case)
ufo.dropna(how='all').shape
    
    Out[116]:
In [117]:
    
# if 'any' values are missing in a row (considering only 'City' and 'Shape Reported'), then drop that row
ufo.dropna(subset=['City', 'Shape Reported'], how='any').shape
    
    Out[117]:
In [118]:
    
# if 'all' values are missing in a row (considering only 'City' and 'Shape Reported'), then drop that row
ufo.dropna(subset=['City', 'Shape Reported'], how='all').shape
    
    Out[118]:
In [119]:
    
# 'value_counts' does not include missing values by default
ufo['Shape Reported'].value_counts().head()
    
    Out[119]:
In [120]:
    
# explicitly include missing values
ufo['Shape Reported'].value_counts(dropna=False).head()
    
    Out[120]:
Documentation for value_counts
In [121]:
    
# fill in missing values with a specified value
ufo['Shape Reported'].fillna(value='VARIOUS', inplace=True)
    
Documentation for fillna
In [122]:
    
# confirm that the missing values were filled in
ufo['Shape Reported'].value_counts().head()
    
    Out[122]:
In [123]:
    
# read a dataset of alcohol consumption into a DataFrame
drinks = pd.read_csv('data/drinks.csv')
drinks.head()
    
    Out[123]:
In [124]:
    
# every DataFrame has an index (sometimes called the "row labels")
drinks.index
    
    Out[124]:
In [125]:
    
# column names are also stored in a special "index" object
drinks.columns
    
    Out[125]:
In [126]:
    
# neither the index nor the columns are included in the shape
drinks.shape
    
    Out[126]:
In [127]:
    
# index and columns both default to integers if you don't define them
pd.read_table('data/u.user', header=None, sep='|').head()
    
    Out[127]:
What is the index used for?
In [128]:
    
# identification: index remains with each row when filtering the DataFrame
drinks[drinks.continent=='South America']
    
    Out[128]:
In [129]:
    
# selection: select a portion of the DataFrame using the index
drinks.loc[23, 'beer_servings']
    
    Out[129]:
Documentation for loc
In [130]:
    
# set an existing column as the index
drinks.set_index('country', inplace=True)
drinks.head()
    
    Out[130]:
Documentation for set_index
In [131]:
    
# 'country' is now the index
drinks.index
    
    Out[131]:
In [132]:
    
# 'country' is no longer a column
drinks.columns
    
    Out[132]:
In [133]:
    
# 'country' data is no longer part of the DataFrame contents
drinks.shape
    
    Out[133]:
In [134]:
    
# country name can now be used for selection
drinks.loc['Brazil', 'beer_servings']
    
    Out[134]:
In [135]:
    
# index name is optional
drinks.index.name = None
drinks.head()
    
    Out[135]:
In [136]:
    
# restore the index name, and move the index back to a column
drinks.index.name = 'country'
drinks.reset_index(inplace=True)
drinks.head()
    
    Out[136]:
Documentation for reset_index
In [137]:
    
# many DataFrame methods output a DataFrame
drinks.describe()
    
    Out[137]:
In [138]:
    
# you can interact with any DataFrame using its index and columns
drinks.describe().loc['25%', 'beer_servings']
    
    Out[138]:
In [139]:
    
# read a dataset of alcohol consumption into a DataFrame
drinks = pd.read_csv('data/drinks.csv')
drinks.head()
    
    Out[139]:
In [140]:
    
# every DataFrame has an index
drinks.index
    
    Out[140]:
In [141]:
    
# every Series also has an index (which carries over from the DataFrame)
drinks.continent.head()
    
    Out[141]:
In [142]:
    
# set 'country' as the index
drinks.set_index('country', inplace=True)
    
Documentation for set_index
In [143]:
    
# Series index is on the left, values are on the right
drinks.continent.head()
    
    Out[143]:
In [144]:
    
# another example of a Series (output from the 'value_counts' method)
drinks.continent.value_counts()
    
    Out[144]:
Documentation for value_counts
In [145]:
    
# access the Series index
drinks.continent.value_counts().index
    
    Out[145]:
In [146]:
    
# access the Series values
drinks.continent.value_counts().values
    
    Out[146]:
In [147]:
    
# elements in a Series can be selected by index (using bracket notation)
drinks.continent.value_counts()['Africa']
    
    Out[147]:
In [148]:
    
# any Series can be sorted by its values
drinks.continent.value_counts().sort_values()
    
    Out[148]:
In [149]:
    
# any Series can also be sorted by its index
drinks.continent.value_counts().sort_index()
    
    Out[149]:
Documentation for sort_values and sort_index
What is the index used for?
In [150]:
    
# 'beer_servings' Series contains the average annual beer servings per person
drinks.beer_servings.head()
    
    Out[150]:
In [151]:
    
# create a Series containing the population of two countries
people = pd.Series([3000000, 85000], index=['Albania', 'Andorra'], name='population')
people
    
    Out[151]:
Documentation for Series
In [152]:
    
# calculate the total annual beer servings for each country
(drinks.beer_servings * people).head()
    
    Out[152]:
In [153]:
    
# concatenate the 'drinks' DataFrame with the 'population' Series (aligns by the index)
pd.concat([drinks, people], axis=1).head()
    
    Out[153]:
In [154]:
    
# read a dataset of UFO reports into a DataFrame
ufo = pd.read_csv('data/ufo.csv')
ufo.head(3)
    
    Out[154]:
The loc method is used to select rows and columns by label. You can pass it:
- a single label
- a list of labels
- a slice of labels
- a boolean Series
- a colon (which indicates "all labels")
In [155]:
    
# row 0, all columns
ufo.loc[0, :]
    
    Out[155]:
In [156]:
    
# rows 0 and 1 and 2, all columns
ufo.loc[[0, 1, 2], :]
    
    Out[156]:
In [157]:
    
# rows 0 through 2 (inclusive), all columns
ufo.loc[0:2, :]
    
    Out[157]:
In [158]:
    
# this implies "all columns", but explicitly stating "all columns" is better
ufo.loc[0:2]
    
    Out[158]:
In [159]:
    
# rows 0 through 2 (inclusive), column 'City'
ufo.loc[0:2, 'City']
    
    Out[159]:
In [160]:
    
# rows 0 through 2 (inclusive), columns 'City' and 'State'
ufo.loc[0:2, ['City', 'State']]
    
    Out[160]:
In [161]:
    
# accomplish the same thing using double brackets - but using 'loc' is preferred since it's more explicit
ufo[['City', 'State']].head(3)
    
    Out[161]:
In [162]:
    
# rows 0 through 2 (inclusive), columns 'City' through 'State' (inclusive)
ufo.loc[0:2, 'City':'State']
    
    Out[162]:
In [163]:
    
# accomplish the same thing using 'head' and 'drop'
ufo.head(3).drop('Time', axis=1)
    
    Out[163]:
In [164]:
    
# rows in which the 'City' is 'Oakland', column 'State'
ufo.loc[ufo.City=='Oakland', 'State']
    
    Out[164]:
In [165]:
    
# accomplish the same thing using "chained indexing" - but using 'loc' is preferred since chained indexing can cause problems
ufo[ufo.City=='Oakland'].State
    
    Out[165]:
The iloc method is used to select rows and columns by integer position. You can pass it:
- a single integer position
- a list of integer positions
- a slice of integer positions
- a colon (which indicates "all positions")
In [166]:
    
# rows in positions 0 and 1, columns in positions 0 and 3
ufo.iloc[[0, 1], [0, 3]]
    
    Out[166]:
In [167]:
    
# rows in positions 0 through 2 (exclusive), columns in positions 0 through 4 (exclusive)
ufo.iloc[0:2, 0:4]
    
    Out[167]:
In [168]:
    
# rows in positions 0 through 2 (exclusive), all columns
ufo.iloc[0:2, :]
    
    Out[168]:
In [169]:
    
# accomplish the same thing - but using 'iloc' is preferred since it's more explicit
ufo[0:2]
    
    Out[169]:
The ix method is used to select rows and columns by label or integer position, and should only be used when you need to mix label-based and integer-based selection in the same call.
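Note that ix was deprecated in later pandas versions; the same mixed selection can be expressed with loc or iloc by translating between labels and positions (a sketch, assuming the drinks DataFrame read in below):

# label-based row, position-based column: translate the position to a label
drinks.loc['Albania', drinks.columns[0]]
# position-based row, label-based column: translate the label to a position
drinks.iloc[1, drinks.columns.get_loc('beer_servings')]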
In [170]:
    
# read a dataset of alcohol consumption into a DataFrame and set 'country' as the index
drinks = pd.read_csv('data/drinks.csv', index_col='country')
drinks.head()
    
    Out[170]:
In [171]:
    
# row with label 'Albania', column in position 0
drinks.ix['Albania', 0]
    
    Out[171]:
In [172]:
    
# row in position 1, column with label 'beer_servings'
drinks.ix[1, 'beer_servings']
    
    Out[172]:
Rules for using numbers with ix:
- If the labels on that axis are strings, numbers are treated as integer positions, and thus slices are exclusive of the stop.
- If the labels on that axis are integers, numbers are treated as labels, and thus slices are inclusive of the stop.
In [173]:
    
# rows 'Albania' through 'Andorra' (inclusive), columns in positions 0 through 2 (exclusive)
drinks.ix['Albania':'Andorra', 0:2]
    
    Out[173]:
In [174]:
    
# rows 0 through 2 (inclusive), columns in positions 0 through 2 (exclusive)
ufo.ix[0:2, 0:2]
    
    Out[174]:
In [175]:
    
# read a dataset of UFO reports into a DataFrame
ufo = pd.read_csv('data/ufo.csv')
ufo.head()
    
    Out[175]:
In [176]:
    
ufo.shape
    
    Out[176]:
In [177]:
    
# remove the 'City' column (doesn't affect the DataFrame since inplace=False)
ufo.drop('City', axis=1).head()
    
    Out[177]:
In [178]:
    
# confirm that the 'City' column was not actually removed
ufo.head()
    
    Out[178]:
In [179]:
    
# remove the 'City' column (does affect the DataFrame since inplace=True)
ufo.drop('City', axis=1, inplace=True)
    
In [180]:
    
# confirm that the 'City' column was actually removed
ufo.head()
    
    Out[180]:
In [181]:
    
# drop a row if any value is missing from that row (doesn't affect the DataFrame since inplace=False)
ufo.dropna(how='any').shape
    
    Out[181]:
In [182]:
    
# confirm that no rows were actually removed
ufo.shape
    
    Out[182]:
In [183]:
    
# use an assignment statement instead of the 'inplace' parameter
ufo = ufo.set_index('Time')
ufo.tail()
    
    Out[183]:
In [184]:
    
# fill missing values using "backward fill" strategy (doesn't affect the DataFrame since inplace=False)
ufo.fillna(method='bfill').tail()
    
    Out[184]:
In [185]:
    
# compare with "forward fill" strategy (doesn't affect the DataFrame since inplace=False)
ufo.fillna(method='ffill').tail()
    
    Out[185]:
In [186]:
    
# read a dataset of alcohol consumption into a DataFrame
drinks = pd.read_csv('data/drinks.csv')
drinks.head()
    
    Out[186]:
In [187]:
    
# exact memory usage is unknown because object columns hold references to strings stored elsewhere
drinks.info()
    
    
In [188]:
    
# force pandas to calculate the true memory usage
drinks.info(memory_usage='deep')
    
    
In [189]:
    
# calculate the memory usage for each Series (in bytes)
drinks.memory_usage(deep=True)
    
    Out[189]:
Documentation for info and memory_usage
In [190]:
    
# use the 'category' data type (new in pandas 0.15) to store the 'continent' strings as integers
drinks['continent'] = drinks.continent.astype('category')
drinks.dtypes
    
    Out[190]:
In [191]:
    
# 'continent' Series appears to be unchanged
drinks.continent.head()
    
    Out[191]:
In [192]:
    
# strings are now encoded (0 means 'Africa', 1 means 'Asia', 2 means 'Europe', etc.)
drinks.continent.cat.codes.head()
    
    Out[192]:
In [193]:
    
# memory usage has been drastically reduced
drinks.memory_usage(deep=True)
    
    Out[193]:
In [194]:
    
# repeat this process for the 'country' Series
drinks['country'] = drinks.country.astype('category')
drinks.memory_usage(deep=True)
    
    Out[194]:
In [195]:
    
# memory usage increased because we created 193 categories
drinks.country.cat.categories
    
    Out[195]:
The category data type should only be used with a string Series that has a small number of possible values.
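One quick heuristic is to compare the number of unique values to the number of rows (a sketch):

# a small ratio suggests a good candidate for the 'category' type
drinks.continent.nunique() / float(len(drinks))    # 6 continents across 193 rows: convert
drinks.country.nunique() / float(len(drinks))      # every value is unique (ratio 1.0): don't convert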
In [196]:
    
# create a small DataFrame from a dictionary
df = pd.DataFrame({'ID':[100, 101, 102, 103], 'quality':['good', 'very good', 'good', 'excellent']})
df
    
    Out[196]:
In [197]:
    
# sort the DataFrame by the 'quality' Series (alphabetical order)
df.sort_values('quality')
    
    Out[197]:
In [198]:
    
# define a logical ordering for the categories
df['quality'] = df.quality.astype('category', categories=['good', 'very good', 'excellent'], ordered=True)
df.quality
    
    Out[198]:
In [199]:
    
# sort the DataFrame by the 'quality' Series (logical order)
df.sort_values('quality')
    
    Out[199]:
In [200]:
    
# comparison operators work with ordered categories
df.loc[df.quality > 'good', :]
    
    Out[200]:
In [201]:
    
# read the training dataset from Kaggle's Titanic competition into a DataFrame
train = pd.read_csv('http://bit.ly/kaggletrain')
train.head()
    
    Out[201]:
Goal: Predict passenger survival aboard the Titanic based on passenger attributes
In [202]:
    
# create a feature matrix 'X' by selecting two DataFrame columns
feature_cols = ['Pclass', 'Parch']
X = train.loc[:, feature_cols]
X.shape
    
    Out[202]:
In [203]:
    
# create a response vector 'y' by selecting a Series
y = train.Survived
y.shape
    
    Out[203]:
Note: There is no need to convert these pandas objects to NumPy arrays. scikit-learn will understand these objects as long as they are entirely numeric and have the proper shapes.
In [204]:
    
# fit a classification model to the training data
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()
logreg.fit(X, y)
    
    Out[204]:
Video series: Introduction to machine learning with scikit-learn
In [205]:
    
# read the testing dataset from Kaggle's Titanic competition into a DataFrame
test = pd.read_csv('http://bit.ly/kaggletest')
test.head()
    
    Out[205]:
In [206]:
    
# create a feature matrix from the testing data that matches the training data
X_new = test.loc[:, feature_cols]
X_new.shape
    
    Out[206]:
In [207]:
    
# use the fitted model to make predictions for the testing set observations
new_pred_class = logreg.predict(X_new)
    
In [208]:
    
# create a DataFrame of passenger IDs and testing set predictions
pd.DataFrame({'PassengerId':test.PassengerId, 'Survived':new_pred_class}).head()
    
    Out[208]:
Documentation for the DataFrame constructor
In [209]:
    
# ensure that PassengerId is the first column by setting it as the index
pd.DataFrame({'PassengerId':test.PassengerId, 'Survived':new_pred_class}).set_index('PassengerId').head()
    
    Out[209]:
In [210]:
    
# write the DataFrame to a CSV file that can be submitted to Kaggle
pd.DataFrame({'PassengerId':test.PassengerId, 'Survived':new_pred_class}).set_index('PassengerId').to_csv('sub.csv')
    
Documentation for to_csv
In [211]:
    
# save a DataFrame to disk ("pickle it")
train.to_pickle('train.pkl')
    
In [212]:
    
# read a pickled object from disk ("unpickle it")
pd.read_pickle('train.pkl').head()
    
    Out[212]:
Documentation for to_pickle and read_pickle
Question: Could you explain how to read the pandas documentation?
Question: What is the difference between ufo.isnull() and pd.isnull(ufo)?
In [213]:
    
# read a dataset of UFO reports into a DataFrame
ufo = pd.read_csv('http://bit.ly/uforeports')
ufo.head()
    
    Out[213]:
In [214]:
    
# use 'isnull' as a top-level function
pd.isnull(ufo).head()
    
    Out[214]:
In [215]:
    
# equivalent: use 'isnull' as a DataFrame method
ufo.isnull().head()
    
    Out[215]:
Documentation for isnull
Question: Why are DataFrame slices inclusive when using .loc, but exclusive when using .iloc?
In [216]:
    
# label-based slicing is inclusive of the start and stop
ufo.loc[0:4, :]
    
    Out[216]:
In [217]:
    
# position-based slicing is inclusive of the start and exclusive of the stop
ufo.iloc[0:4, :]
    
    Out[217]:
In [218]:
    
# 'iloc' is simply following NumPy's slicing convention...
ufo.values[0:4, :]
    
    Out[218]:
In [219]:
    
# ...and NumPy is simply following Python's slicing convention
'python'[0:4]
    
    Out[219]:
In [220]:
    
# 'loc' is inclusive of the stopping label because you don't necessarily know what label will come after it
ufo.loc[0:4, 'City':'State']
    
    Out[220]:
Question: How do I randomly sample rows from a DataFrame?
In [221]:
    
# sample 3 rows from the DataFrame without replacement (new in pandas 0.16.1)
ufo.sample(n=3)
    
    Out[221]:
Documentation for sample
In [222]:
    
# use the 'random_state' parameter for reproducibility
ufo.sample(n=3, random_state=42)
    
    Out[222]:
In [223]:
    
# sample 75% of the DataFrame's rows without replacement
train = ufo.sample(frac=0.75, random_state=99)
    
In [224]:
    
# store the remaining 25% of the rows in another DataFrame
test = ufo.loc[~ufo.index.isin(train.index), :]
    
Documentation for isin
In [225]:
    
# read the training dataset from Kaggle's Titanic competition
train = pd.read_csv('http://bit.ly/kaggletrain')
train.head()
    
    Out[225]:
In [226]:
    
# create the 'Sex_male' dummy variable using the 'map' method
train['Sex_male'] = train.Sex.map({'female':0, 'male':1})
train.head()
    
    Out[226]:
Documentation for map
In [227]:
    
# alternative: use 'get_dummies' to create one column for every possible value
pd.get_dummies(train.Sex).head()
    
    Out[227]:
Generally speaking:
- If you have "K" possible values for a categorical feature, you only need "K-1" dummy variables to capture all of the information about that feature.
- One convention is to drop the first dummy variable, which defines that value as the "baseline".
In [228]:
    
# drop the first dummy variable ('female') using the 'iloc' method
pd.get_dummies(train.Sex).iloc[:, 1:].head()
    
    Out[228]:
In [229]:
    
# add a prefix to identify the source of the dummy variables
pd.get_dummies(train.Sex, prefix='Sex').iloc[:, 1:].head()
    
    Out[229]:
In [230]:
    
# use 'get_dummies' with a feature that has 3 possible values
pd.get_dummies(train.Embarked, prefix='Embarked').head(10)
    
    Out[230]:
In [231]:
    
# drop the first dummy variable ('C')
pd.get_dummies(train.Embarked, prefix='Embarked').iloc[:, 1:].head(10)
    
    Out[231]:
How to translate these values back to the original 'Embarked' value:
- Embarked_Q=0 and Embarked_S=0 means C
- Embarked_Q=1 and Embarked_S=0 means Q
- Embarked_Q=0 and Embarked_S=1 means S
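A sketch of reversing the encoding with numpy (using the Embarked_Q and Embarked_S columns created above; note that rows where 'Embarked' was missing will also map to 'C' here):

import numpy as np
dummies = pd.get_dummies(train.Embarked, prefix='Embarked').iloc[:, 1:]
np.where(dummies.Embarked_Q == 1, 'Q', np.where(dummies.Embarked_S == 1, 'S', 'C'))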
In [232]:
    
# save the DataFrame of dummy variables and concatenate them to the original DataFrame
embarked_dummies = pd.get_dummies(train.Embarked, prefix='Embarked').iloc[:, 1:]
train = pd.concat([train, embarked_dummies], axis=1)
train.head()
    
    Out[232]:
Documentation for concat
In [233]:
    
# reset the DataFrame
train = pd.read_csv('http://bit.ly/kaggletrain')
train.head()
    
    Out[233]:
In [234]:
    
# pass the DataFrame to 'get_dummies' and specify which columns to dummy (it drops the original columns)
pd.get_dummies(train, columns=['Sex', 'Embarked']).head()
    
    Out[234]:
In [235]:
    
# use the 'drop_first' parameter (new in pandas 0.18) to drop the first dummy variable for each feature
pd.get_dummies(train, columns=['Sex', 'Embarked'], drop_first=True).head()
    
    Out[235]:
Documentation for get_dummies
In [236]:
    
# read a dataset of UFO reports into a DataFrame
ufo = pd.read_csv('http://bit.ly/uforeports')
ufo.head()
    
    Out[236]:
In [237]:
    
# 'Time' is currently stored as a string
ufo.dtypes
    
    Out[237]:
In [238]:
    
# hour could be accessed using string slicing, but this approach breaks too easily
ufo.Time.str.slice(-5, -3).astype(int).head()
    
    Out[238]:
In [239]:
    
# convert 'Time' to datetime format
ufo['Time'] = pd.to_datetime(ufo.Time)
ufo.head()
    
    Out[239]:
In [240]:
    
ufo.dtypes
    
    Out[240]:
Documentation for to_datetime
In [241]:
    
# convenient Series attributes are now available
ufo.Time.dt.hour.head()
    
    Out[241]:
In [242]:
    
ufo.Time.dt.weekday_name.head()
    
    Out[242]:
In [243]:
    
ufo.Time.dt.dayofyear.head()
    
    Out[243]:
API reference for datetime properties and methods
In [244]:
    
# convert a single string to datetime format (outputs a timestamp object)
ts = pd.to_datetime('1/1/1999')
ts
    
    Out[244]:
In [245]:
    
# compare a datetime Series with a timestamp
ufo.loc[ufo.Time >= ts, :].head()
    
    Out[245]:
In [246]:
    
# perform mathematical operations with timestamps (outputs a timedelta object)
ufo.Time.max() - ufo.Time.min()
    
    Out[246]:
In [247]:
    
# timedelta objects also have attributes you can access
(ufo.Time.max() - ufo.Time.min()).days
    
    Out[247]:
In [248]:
    
# allow plots to appear in the notebook
%matplotlib inline
    
In [249]:
    
# count the number of UFO reports per year
ufo['Year'] = ufo.Time.dt.year
ufo.Year.value_counts().sort_index().head()
    
    Out[249]:
In [250]:
    
# plot the number of UFO reports per year (line plot is the default)
ufo.Year.value_counts().sort_index().plot()
    
    Out[250]:
    
In [251]:
    
# read a dataset of movie reviewers into a DataFrame
user_cols = ['user_id', 'age', 'gender', 'occupation', 'zip_code']
users = pd.read_table('http://bit.ly/movieusers', sep='|', header=None, names=user_cols, index_col='user_id')
users.head()
    
    Out[251]:
In [252]:
    
users.shape
    
    Out[252]:
In [253]:
    
# detect duplicate zip codes: True if an item is identical to a previous item
users.zip_code.duplicated().tail()
    
    Out[253]:
In [254]:
    
# count the duplicate items (True becomes 1, False becomes 0)
users.zip_code.duplicated().sum()
    
    Out[254]:
In [255]:
    
# detect duplicate DataFrame rows: True if an entire row is identical to a previous row
users.duplicated().tail()
    
    Out[255]:
In [256]:
    
# count the duplicate rows
users.duplicated().sum()
    
    Out[256]:
Logic for duplicated:
- keep='first' (default): Mark duplicates as True except for the first occurrence.
- keep='last': Mark duplicates as True except for the last occurrence.
- keep=False: Mark all duplicates as True.
In [257]:
    
# examine the duplicate rows (ignoring the first occurrence)
users.loc[users.duplicated(keep='first'), :]
    
    Out[257]:
In [258]:
    
# examine the duplicate rows (ignoring the last occurrence)
users.loc[users.duplicated(keep='last'), :]
    
    Out[258]:
In [259]:
    
# examine the duplicate rows (including all duplicates)
users.loc[users.duplicated(keep=False), :]
    
    Out[259]:
In [260]:
    
# drop the duplicate rows (inplace=False by default)
users.drop_duplicates(keep='first').shape
    
    Out[260]:
In [261]:
    
users.drop_duplicates(keep='last').shape
    
    Out[261]:
In [262]:
    
users.drop_duplicates(keep=False).shape
    
    Out[262]:
Documentation for drop_duplicates
In [263]:
    
# only consider a subset of columns when identifying duplicates
users.duplicated(subset=['age', 'zip_code']).sum()
    
    Out[263]:
In [264]:
    
users.drop_duplicates(subset=['age', 'zip_code']).shape
    
    Out[264]:
In [265]:
    
# read a dataset of top-rated IMDb movies into a DataFrame
movies = pd.read_csv('http://bit.ly/imdbratings')
movies.head()
    
    Out[265]:
In [266]:
    
# count the missing values in the 'content_rating' Series
movies.content_rating.isnull().sum()
    
    Out[266]:
In [267]:
    
# examine the DataFrame rows that contain those missing values
movies[movies.content_rating.isnull()]
    
    Out[267]:
In [268]:
    
# examine the unique values in the 'content_rating' Series
movies.content_rating.value_counts()
    
    Out[268]:
Goal: Mark the 'NOT RATED' values as missing values, represented by 'NaN'.
In [269]:
    
# first, locate the relevant rows
movies[movies.content_rating=='NOT RATED'].head()
    
    Out[269]:
In [270]:
    
# then, select the 'content_rating' Series from those rows
movies[movies.content_rating=='NOT RATED'].content_rating.head()
    
    Out[270]:
In [271]:
    
# finally, replace the 'NOT RATED' values with 'NaN' (imported from NumPy)
import numpy as np
movies[movies.content_rating=='NOT RATED'].content_rating = np.nan
    
    
Problem: That statement involves two operations, a __getitem__ and a __setitem__. pandas can't guarantee whether the __getitem__ operation returns a view or a copy of the data.
- If __getitem__ returns a view of the data, __setitem__ will affect the 'movies' DataFrame.
- If __getitem__ returns a copy of the data, __setitem__ will not affect the 'movies' DataFrame.
In [272]:
    
# the 'content_rating' Series has not changed
movies.content_rating.isnull().sum()
    
    Out[272]:
Solution: Use the loc method, which replaces the 'NOT RATED' values in a single __setitem__ operation.
In [273]:
    
# replace the 'NOT RATED' values with 'NaN' (does not cause a SettingWithCopyWarning)
movies.loc[movies.content_rating=='NOT RATED', 'content_rating'] = np.nan
    
In [274]:
    
# this time, the 'content_rating' Series has changed
movies.content_rating.isnull().sum()
    
    Out[274]:
Summary: Use the loc method any time you are selecting rows and columns in the same statement.
More information: Modern Pandas (Part 1)
In [275]:
    
# create a DataFrame only containing movies with a high 'star_rating'
top_movies = movies.loc[movies.star_rating >= 9, :]
top_movies
    
    Out[275]:
Goal: Fix the 'duration' for 'The Shawshank Redemption'.
In [276]:
    
# overwrite the relevant cell with the correct duration
top_movies.loc[0, 'duration'] = 150
    
    
Problem: pandas isn't sure whether 'top_movies' is a view or a copy of 'movies'.
In [277]:
    
# 'top_movies' DataFrame has been updated
top_movies
    
    Out[277]:
In [278]:
    
# 'movies' DataFrame has not been updated
movies.head(1)
    
    Out[278]:
Solution: Any time you are attempting to create a DataFrame copy, use the copy method.
In [279]:
    
# explicitly create a copy of 'movies'
top_movies = movies.loc[movies.star_rating >= 9, :].copy()
    
In [280]:
    
# pandas now knows that you are updating a copy instead of a view (does not cause a SettingWithCopyWarning)
top_movies.loc[0, 'duration'] = 150
    
In [281]:
    
# 'top_movies' DataFrame has been updated
top_movies
    
    Out[281]:
Documentation on indexing and selection: Returning a view versus a copy
Stack Overflow: What is the point of views in pandas if it is undefined whether an indexing operation returns a view or a copy?