In [1]:
# import necessary modules
import pandas as pd, numpy as np
In [2]:
# a python list is a basic data type
li = [1, 2, 3, 4]
li
Out[2]:
In [3]:
# a numpy array is like a list, but faster and more compact
ar = np.array([1, 2, 3, 4])
ar
Out[3]:
In [4]:
# you can create a numpy array from an existing list too
ar = np.array(li)
ar
Out[4]:
In [5]:
# a pandas series is based on a numpy array - it's fast, compact, and has more functionality
se1 = pd.Series(ar)
se1
Out[5]:
In [6]:
# you can create a new Series by passing in a list variable or array
# series can contain data types other than just integers
se2 = pd.Series(['a', 'b', 'c', 'd'])
se2
Out[6]:
the first "column" is an index, the second is the series of values
In [7]:
# you can change a series's index
se2.index = ['w', 'x', 'y', 'z']
se2
Out[7]:
In [8]:
# a pandas dataframe is like a table where each column is a series
df = pd.DataFrame([1, 2, 3, 4])
df
Out[8]:
this is a one-dimensional DataFrame... it's equivalent to a Series
In [9]:
# a dict can contain multiple lists and label them
di1 = {'column1':[1, 2, 3, 4], 'column2':[5, 6, 7, 8]}
di1
Out[9]:
In [10]:
# a dict can also contain multiple lists/series and labels
di2 = {'column1':li, 'column2':se1}
di2
Out[10]:
In [11]:
# a pandas dataframe can contain multiple columns/series
# you can create a dataframe by passing in a list, array, series, or dict
df = pd.DataFrame(di1)
df
Out[11]:
this is a two-dimensional DataFrame. Its index contains row labels (0, 1, 2, 3) and its columns are indexed by column labels
In [12]:
# the row labels in the index are accessed by the .index attribute of the DataFrame object
print(df.index)
# the column labels are accessed by the .columns attribute of the DataFrame object
print(df.columns)
In [13]:
# make sure your indices match!
di = {'column1':se1, 'column2':se2}
df = pd.DataFrame(di)
df
Out[13]:
In [14]:
# numpy offers a useful datatype called NaN for null values - has to be floating point, not int
x = np.nan
print(x)
print(type(x))
In [15]:
# pandas can load CSV files as DataFrames - it pulls column labels from the first row of the data file
df2 = pd.read_csv('data/pandas-test.csv')
In [16]:
# you can view the first few or the last few rows of a DataFrame with the .head() or .tail() methods
df2.head()
Out[16]:
In [17]:
# returns 5 rows by default, or you can pass the number of rows you want as an argument
df2.tail(3)
Out[17]:
In [18]:
# you can add a new column to a DataFrame
df2['country'] = ''
df2
Out[18]:
In [19]:
# you can update the values of an entire column
df2['country'] = 'USA'
df2
Out[19]:
In [20]:
# you can set the values of a column (aka, Series) in the DataFrame to a list of values
df2['country'] = ['USA', 'United States'] * 4
df2
Out[20]:
In [21]:
# you can use fast vectorized methods on a pandas series (aka, a column in our dataframe)
df2['country'].str.replace('United States', 'USA')
df2
Out[21]:
that didn't do anything to our dataframebecause .str.replace() returns the updated version - it doesn't perform the operation in place
In [22]:
# we need to capture the updated values when they get returned
df2['country'] = df2['country'].str.replace('United States', 'USA')
df2
Out[22]:
In [23]:
# you can change the column names
df2.columns = ['city_name', 'state_name', 'nation']
df2
Out[23]:
In [24]:
# you can save your DataFrame as a csv file
df2.to_csv('data/my_data.csv')
In [25]:
# there are lots of ways to create dataframes
list_of_tuples = [('sf', 2012), ('phx', np.nan), ('phx', 2005), ('chi', 2009)]
df = pd.DataFrame(list_of_tuples, columns=['city', 'year'])
df['country'] = 'USA'
df['continent'] = 'North America'
df
Out[25]:
In [26]:
# you can remove a column from a dataframe with the .drop() method by referencing its label and axis
# axis 0 = rows
# axis 1 = columns
df2 = df.drop('country', axis=1, inplace=False)
df2
Out[26]:
In [27]:
# you can also remove a column from a dataframe with the del() function
df3 = pd.DataFrame(df2)
del(df3['continent'])
df3
Out[27]:
del() is a function but .drop() is a method. The difference is that a method is performed on an object, in this case a DataFrame. A function is independent of an object - it's just a chunk of code that is called by name and allows data to be passed to it by arguments.
In [28]:
# you can use the len() function to check the row count of a DataFrame
len(df3)
Out[28]:
In [29]:
# you can also use .count() method to check the row count, but this excludes NaNs
df3.count()
Out[29]:
In [30]:
# or you can use the .shape attribute to get the shape of the DataFrame
df3.shape
Out[30]:
An attribute is different than a method or a function. Notice it doesn't use parentheses like .shape()
In [31]:
# you can get a count of values that appear in some column
df3['city'].value_counts()
Out[31]:
In [32]:
# you can drop rows that contain duplicate values in some specified column
df4 = df3.drop_duplicates('city')
df4
Out[32]:
look at the index above. remember that it's not a row counter, it's an index of row labels
In [33]:
# back to our earlier dataframe
df
Out[33]:
In [34]:
# you can perform operations across an entire Series (aka column in our DataFrame) at once
df['year5'] = df['year'] + 5
df
Out[34]:
In [ ]: