In [1]:
# import necessary modules
import pandas as pd, numpy as np

Part 1
Reviewing the Basics


In [2]:
# a python list is a basic data type
li = [1, 2, 3, 4]
li


Out[2]:
[1, 2, 3, 4]

In [3]:
# a numpy array is like a list, but faster and more compact
ar = np.array([1, 2, 3, 4])
ar


Out[3]:
array([1, 2, 3, 4])

In [4]:
# you can create a numpy array from an existing list too
ar = np.array(li)
ar


Out[4]:
array([1, 2, 3, 4])

In [5]:
# a pandas series is based on a numpy array - it's fast, compact, and has more functionality
se1 = pd.Series(ar)
se1


Out[5]:
0    1
1    2
2    3
3    4
dtype: int32

In [6]:
# you can create a new Series by passing in a list variable or array
# series can contain data types other than just integers
se2 = pd.Series(['a', 'b', 'c', 'd'])
se2


Out[6]:
0    a
1    b
2    c
3    d
dtype: object

the first "column" is an index, the second is the series of values


In [7]:
# you can change a series's index
se2.index = ['w', 'x', 'y', 'z']
se2


Out[7]:
w    a
x    b
y    c
z    d
dtype: object

In [8]:
# a pandas dataframe is like a table where each column is a series
df = pd.DataFrame([1, 2, 3, 4])
df


Out[8]:
0
0 1
1 2
2 3
3 4

this is a one-dimensional DataFrame... it's equivalent to a Series


In [9]:
# a dict can contain multiple lists and label them
di1 = {'column1':[1, 2, 3, 4], 'column2':[5, 6, 7, 8]}
di1


Out[9]:
{'column1': [1, 2, 3, 4], 'column2': [5, 6, 7, 8]}

In [10]:
# a dict can also contain multiple lists/series and labels
di2 = {'column1':li, 'column2':se1}
di2


Out[10]:
{'column1': [1, 2, 3, 4], 'column2': 0    1
 1    2
 2    3
 3    4
 dtype: int32}

In [11]:
# a pandas dataframe can contain multiple columns/series
# you can create a dataframe by passing in a list, array, series, or dict
df = pd.DataFrame(di1)
df


Out[11]:
column1 column2
0 1 5
1 2 6
2 3 7
3 4 8

this is a two-dimensional DataFrame. Its index contains row labels (0, 1, 2, 3) and its columns are indexed by column labels


In [12]:
# the row labels in the index are accessed by the .index attribute of the DataFrame object
print(df.index)

# the column labels are accessed by the .columns attribute of the DataFrame object
print(df.columns)


RangeIndex(start=0, stop=4, step=1)
Index([u'column1', u'column2'], dtype='object')

In [13]:
# make sure your indices match!
di = {'column1':se1, 'column2':se2}
df = pd.DataFrame(di)
df


Out[13]:
column1 column2
0 1.0 NaN
1 2.0 NaN
2 3.0 NaN
3 4.0 NaN
w NaN a
x NaN b
y NaN c
z NaN d

In [14]:
# numpy offers a useful datatype called NaN for null values - has to be floating point, not int
x = np.nan
print(x)
print(type(x))


nan
<type 'float'>

Part 2:
Working with CSV files


In [15]:
# pandas can load CSV files as DataFrames - it pulls column labels from the first row of the data file
df2 = pd.read_csv('data/pandas-test.csv')

In [16]:
# you can view the first few or the last few rows of a DataFrame with the .head() or .tail() methods
df2.head()


Out[16]:
city state
0 san francisco california
1 phoenix arizona
2 seattle washington
3 dallas texas
4 denver colorado

In [17]:
# returns 5 rows by default, or you can pass the number of rows you want as an argument
df2.tail(3)


Out[17]:
city state
5 chicago illinois
6 portland oregon
7 miami florida

In [18]:
# you can add a new column to a DataFrame
df2['country'] = ''
df2


Out[18]:
city state country
0 san francisco california
1 phoenix arizona
2 seattle washington
3 dallas texas
4 denver colorado
5 chicago illinois
6 portland oregon
7 miami florida

In [19]:
# you can update the values of an entire column
df2['country'] = 'USA'
df2


Out[19]:
city state country
0 san francisco california USA
1 phoenix arizona USA
2 seattle washington USA
3 dallas texas USA
4 denver colorado USA
5 chicago illinois USA
6 portland oregon USA
7 miami florida USA

In [20]:
# you can set the values of a column (aka, Series) in the DataFrame to a list of values
df2['country'] = ['USA', 'United States'] * 4
df2


Out[20]:
city state country
0 san francisco california USA
1 phoenix arizona United States
2 seattle washington USA
3 dallas texas United States
4 denver colorado USA
5 chicago illinois United States
6 portland oregon USA
7 miami florida United States

In [21]:
# you can use fast vectorized methods on a pandas series (aka, a column in our dataframe)
df2['country'].str.replace('United States', 'USA')
df2


Out[21]:
city state country
0 san francisco california USA
1 phoenix arizona United States
2 seattle washington USA
3 dallas texas United States
4 denver colorado USA
5 chicago illinois United States
6 portland oregon USA
7 miami florida United States

that didn't do anything to our dataframebecause .str.replace() returns the updated version - it doesn't perform the operation in place


In [22]:
# we need to capture the updated values when they get returned
df2['country'] = df2['country'].str.replace('United States', 'USA')
df2


Out[22]:
city state country
0 san francisco california USA
1 phoenix arizona USA
2 seattle washington USA
3 dallas texas USA
4 denver colorado USA
5 chicago illinois USA
6 portland oregon USA
7 miami florida USA

In [23]:
# you can change the column names
df2.columns = ['city_name', 'state_name', 'nation']
df2


Out[23]:
city_name state_name nation
0 san francisco california USA
1 phoenix arizona USA
2 seattle washington USA
3 dallas texas USA
4 denver colorado USA
5 chicago illinois USA
6 portland oregon USA
7 miami florida USA

In [24]:
# you can save your DataFrame as a csv file
df2.to_csv('data/my_data.csv')

Part 3:
Dropping and editing values


In [25]:
# there are lots of ways to create dataframes
list_of_tuples = [('sf', 2012), ('phx', np.nan), ('phx', 2005), ('chi', 2009)]
df = pd.DataFrame(list_of_tuples, columns=['city', 'year'])
df['country'] = 'USA'
df['continent'] = 'North America'
df


Out[25]:
city year country continent
0 sf 2012.0 USA North America
1 phx NaN USA North America
2 phx 2005.0 USA North America
3 chi 2009.0 USA North America

In [26]:
# you can remove a column from a dataframe with the .drop() method by referencing its label and axis
# axis 0 = rows
# axis 1 = columns
df2 = df.drop('country', axis=1, inplace=False)
df2


Out[26]:
city year continent
0 sf 2012.0 North America
1 phx NaN North America
2 phx 2005.0 North America
3 chi 2009.0 North America

In [27]:
# you can also remove a column from a dataframe with the del() function
df3 = pd.DataFrame(df2)
del(df3['continent'])
df3


Out[27]:
city year
0 sf 2012.0
1 phx NaN
2 phx 2005.0
3 chi 2009.0

del() is a function but .drop() is a method. The difference is that a method is performed on an object, in this case a DataFrame. A function is independent of an object - it's just a chunk of code that is called by name and allows data to be passed to it by arguments.


In [28]:
# you can use the len() function to check the row count of a DataFrame
len(df3)


Out[28]:
4

In [29]:
# you can also use .count() method to check the row count, but this excludes NaNs
df3.count()


Out[29]:
city    4
year    3
dtype: int64

In [30]:
# or you can use the .shape attribute to get the shape of the DataFrame
df3.shape


Out[30]:
(4, 2)

An attribute is different than a method or a function. Notice it doesn't use parentheses like .shape()


In [31]:
# you can get a count of values that appear in some column
df3['city'].value_counts()


Out[31]:
phx    2
chi    1
sf     1
Name: city, dtype: int64

In [32]:
# you can drop rows that contain duplicate values in some specified column
df4 = df3.drop_duplicates('city')
df4


Out[32]:
city year
0 sf 2012.0
1 phx NaN
3 chi 2009.0

look at the index above. remember that it's not a row counter, it's an index of row labels


In [33]:
# back to our earlier dataframe
df


Out[33]:
city year country continent
0 sf 2012.0 USA North America
1 phx NaN USA North America
2 phx 2005.0 USA North America
3 chi 2009.0 USA North America

In [34]:
# you can perform operations across an entire Series (aka column in our DataFrame) at once
df['year5'] = df['year'] + 5
df


Out[34]:
city year country continent year5
0 sf 2012.0 USA North America 2017.0
1 phx NaN USA North America NaN
2 phx 2005.0 USA North America 2010.0
3 chi 2009.0 USA North America 2014.0

In [ ]: