In [1]:

    
# import necessary modules
import pandas as pd, numpy as np

Part 1
Reviewing the Basics



In [2]:

    
# a python list is a basic data type
li = [1, 2, 3, 4]
li









    Out[2]:





[1, 2, 3, 4]



In [3]:

    
# a numpy array is like a list, but faster and more compact
ar = np.array([1, 2, 3, 4])
ar









    Out[3]:





array([1, 2, 3, 4])



In [4]:

    
# you can create a numpy array from an existing list too
ar = np.array(li)
ar









    Out[4]:





array([1, 2, 3, 4])



In [5]:

    
# a pandas series is based on a numpy array - it's fast, compact, and has more functionality
se1 = pd.Series(ar)
se1









    Out[5]:





0    1
1    2
2    3
3    4
dtype: int32



In [6]:

    
# you can create a new Series by passing in a list variable or array
# series can contain data types other than just integers
se2 = pd.Series(['a', 'b', 'c', 'd'])
se2









    Out[6]:





0    a
1    b
2    c
3    d
dtype: object

the first "column" is an index, the second is the series of values



In [7]:

    
# you can change a series's index
se2.index = ['w', 'x', 'y', 'z']
se2









    Out[7]:





w    a
x    b
y    c
z    d
dtype: object



In [8]:

    
# a pandas dataframe is like a table where each column is a series
df = pd.DataFrame([1, 2, 3, 4])
df

this is a one-dimensional DataFrame... it's equivalent to a Series



In [9]:

    
# a dict can contain multiple lists and label them
di1 = {'column1':[1, 2, 3, 4], 'column2':[5, 6, 7, 8]}
di1









    Out[9]:





{'column1': [1, 2, 3, 4], 'column2': [5, 6, 7, 8]}



In [10]:

    
# a dict can also contain multiple lists/series and labels
di2 = {'column1':li, 'column2':se1}
di2









    Out[10]:





{'column1': [1, 2, 3, 4], 'column2': 0    1
 1    2
 2    3
 3    4
 dtype: int32}



In [11]:

    
# a pandas dataframe can contain multiple columns/series
# you can create a dataframe by passing in a list, array, series, or dict
df = pd.DataFrame(di1)
df

this is a two-dimensional DataFrame. Its index contains row labels (0, 1, 2, 3) and its columns are indexed by column labels



In [12]:

    
# the row labels in the index are accessed by the .index attribute of the DataFrame object
print(df.index)

# the column labels are accessed by the .columns attribute of the DataFrame object
print(df.columns)









    



RangeIndex(start=0, stop=4, step=1)
Index([u'column1', u'column2'], dtype='object')



In [13]:

    
# make sure your indices match!
di = {'column1':se1, 'column2':se2}
df = pd.DataFrame(di)
df



In [14]:

    
# numpy offers a useful datatype called NaN for null values - has to be floating point, not int
x = np.nan
print(x)
print(type(x))









    



nan
<type 'float'>

Part 2:
Working with CSV files



In [15]:

    
# pandas can load CSV files as DataFrames - it pulls column labels from the first row of the data file
df2 = pd.read_csv('data/pandas-test.csv')



In [16]:

    
# you can view the first few or the last few rows of a DataFrame with the .head() or .tail() methods
df2.head()









    Out[16]:






  
    
      
      city
      state
    
  
  
    
      0
      san francisco
      california
    
    
      1
      phoenix
      arizona
    
    
      2
      seattle
      washington
    
    
      3
      dallas
      texas
    
    
      4
      denver
      colorado



In [17]:

    
# returns 5 rows by default, or you can pass the number of rows you want as an argument
df2.tail(3)









    Out[17]:






  
    
      
      city
      state
    
  
  
    
      5
      chicago
      illinois
    
    
      6
      portland
      oregon
    
    
      7
      miami
      florida



In [18]:

    
# you can add a new column to a DataFrame
df2['country'] = ''
df2









    Out[18]:






  
    
      
      city
      state
      country
    
  
  
    
      0
      san francisco
      california
      
    
    
      1
      phoenix
      arizona
      
    
    
      2
      seattle
      washington
      
    
    
      3
      dallas
      texas
      
    
    
      4
      denver
      colorado
      
    
    
      5
      chicago
      illinois
      
    
    
      6
      portland
      oregon
      
    
    
      7
      miami
      florida



In [19]:

    
# you can update the values of an entire column
df2['country'] = 'USA'
df2









    Out[19]:






  
    
      
      city
      state
      country
    
  
  
    
      0
      san francisco
      california
      USA
    
    
      1
      phoenix
      arizona
      USA
    
    
      2
      seattle
      washington
      USA
    
    
      3
      dallas
      texas
      USA
    
    
      4
      denver
      colorado
      USA
    
    
      5
      chicago
      illinois
      USA
    
    
      6
      portland
      oregon
      USA
    
    
      7
      miami
      florida
      USA



In [20]:

    
# you can set the values of a column (aka, Series) in the DataFrame to a list of values
df2['country'] = ['USA', 'United States'] * 4
df2









    Out[20]:






  
    
      
      city
      state
      country
    
  
  
    
      0
      san francisco
      california
      USA
    
    
      1
      phoenix
      arizona
      United States
    
    
      2
      seattle
      washington
      USA
    
    
      3
      dallas
      texas
      United States
    
    
      4
      denver
      colorado
      USA
    
    
      5
      chicago
      illinois
      United States
    
    
      6
      portland
      oregon
      USA
    
    
      7
      miami
      florida
      United States



In [21]:

    
# you can use fast vectorized methods on a pandas series (aka, a column in our dataframe)
df2['country'].str.replace('United States', 'USA')
df2









    Out[21]:






  
    
      
      city
      state
      country
    
  
  
    
      0
      san francisco
      california
      USA
    
    
      1
      phoenix
      arizona
      United States
    
    
      2
      seattle
      washington
      USA
    
    
      3
      dallas
      texas
      United States
    
    
      4
      denver
      colorado
      USA
    
    
      5
      chicago
      illinois
      United States
    
    
      6
      portland
      oregon
      USA
    
    
      7
      miami
      florida
      United States

that didn't do anything to our dataframebecause .str.replace() returns the updated version - it doesn't perform the operation in place



In [22]:

    
# we need to capture the updated values when they get returned
df2['country'] = df2['country'].str.replace('United States', 'USA')
df2









    Out[22]:






  
    
      
      city
      state
      country
    
  
  
    
      0
      san francisco
      california
      USA
    
    
      1
      phoenix
      arizona
      USA
    
    
      2
      seattle
      washington
      USA
    
    
      3
      dallas
      texas
      USA
    
    
      4
      denver
      colorado
      USA
    
    
      5
      chicago
      illinois
      USA
    
    
      6
      portland
      oregon
      USA
    
    
      7
      miami
      florida
      USA



In [23]:

    
# you can change the column names
df2.columns = ['city_name', 'state_name', 'nation']
df2









    Out[23]:






  
    
      
      city_name
      state_name
      nation
    
  
  
    
      0
      san francisco
      california
      USA
    
    
      1
      phoenix
      arizona
      USA
    
    
      2
      seattle
      washington
      USA
    
    
      3
      dallas
      texas
      USA
    
    
      4
      denver
      colorado
      USA
    
    
      5
      chicago
      illinois
      USA
    
    
      6
      portland
      oregon
      USA
    
    
      7
      miami
      florida
      USA



In [24]:

    
# you can save your DataFrame as a csv file
df2.to_csv('data/my_data.csv')

Part 3:
Dropping and editing values



In [25]:

    
# there are lots of ways to create dataframes
list_of_tuples = [('sf', 2012), ('phx', np.nan), ('phx', 2005), ('chi', 2009)]
df = pd.DataFrame(list_of_tuples, columns=['city', 'year'])
df['country'] = 'USA'
df['continent'] = 'North America'
df









    Out[25]:






  
    
      
      city
      year
      country
      continent
    
  
  
    
      0
      sf
      2012.0
      USA
      North America
    
    
      1
      phx
      NaN
      USA
      North America
    
    
      2
      phx
      2005.0
      USA
      North America
    
    
      3
      chi
      2009.0
      USA
      North America



In [26]:

    
# you can remove a column from a dataframe with the .drop() method by referencing its label and axis
# axis 0 = rows
# axis 1 = columns
df2 = df.drop('country', axis=1, inplace=False)
df2









    Out[26]:






  
    
      
      city
      year
      continent
    
  
  
    
      0
      sf
      2012.0
      North America
    
    
      1
      phx
      NaN
      North America
    
    
      2
      phx
      2005.0
      North America
    
    
      3
      chi
      2009.0
      North America



In [27]:

    
# you can also remove a column from a dataframe with the del() function
df3 = pd.DataFrame(df2)
del(df3['continent'])
df3

del() is a function but .drop() is a method. The difference is that a method is performed on an object, in this case a DataFrame. A function is independent of an object - it's just a chunk of code that is called by name and allows data to be passed to it by arguments.



In [28]:

    
# you can use the len() function to check the row count of a DataFrame
len(df3)









    Out[28]:





4



In [29]:

    
# you can also use .count() method to check the row count, but this excludes NaNs
df3.count()









    Out[29]:





city    4
year    3
dtype: int64



In [30]:

    
# or you can use the .shape attribute to get the shape of the DataFrame
df3.shape









    Out[30]:





(4, 2)

An attribute is different than a method or a function. Notice it doesn't use parentheses like .shape()



In [31]:

    
# you can get a count of values that appear in some column
df3['city'].value_counts()









    Out[31]:





phx    2
chi    1
sf     1
Name: city, dtype: int64



In [32]:

    
# you can drop rows that contain duplicate values in some specified column
df4 = df3.drop_duplicates('city')
df4

look at the index above. remember that it's not a row counter, it's an index of row labels



In [33]:

    
# back to our earlier dataframe
df









    Out[33]:






  
    
      
      city
      year
      country
      continent
    
  
  
    
      0
      sf
      2012.0
      USA
      North America
    
    
      1
      phx
      NaN
      USA
      North America
    
    
      2
      phx
      2005.0
      USA
      North America
    
    
      3
      chi
      2009.0
      USA
      North America



In [34]:

    
# you can perform operations across an entire Series (aka column in our DataFrame) at once
df['year5'] = df['year'] + 5
df









    Out[34]:






  
    
      
      city
      year
      country
      continent
      year5
    
  
  
    
      0
      sf
      2012.0
      USA
      North America
      2017.0
    
    
      1
      phx
      NaN
      USA
      North America
      NaN
    
    
      2
      phx
      2005.0
      USA
      North America
      2010.0
    
    
      3
      chi
      2009.0
      USA
      North America
      2014.0



In [ ]:

	city	state
0	san francisco	california
1	phoenix	arizona
2	seattle	washington
3	dallas	texas
4	denver	colorado

	city	year	country	continent
0	sf	2012.0	USA	North America
1	phx	NaN	USA	North America
2	phx	2005.0	USA	North America
3	chi	2009.0	USA	North America

	city	year	country	continent	year5
0	sf	2012.0	USA	North America	2017.0
1	phx	NaN	USA	North America	NaN
2	phx	2005.0	USA	North America	2010.0
3	chi	2009.0	USA	North America	2014.0

Part 1 Reviewing the Basics

Part 2: Working with CSV files

Part 3:Dropping and editing values

Part 1
Reviewing the Basics

Part 2:
Working with CSV files

Part 3:
Dropping and editing values