In [63]:
# 3.1: Data Frames - more at https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html
In [5]:
# Import Pandas and configure it not to be in scientific notation
import pandas as pd
pd.set_option('display.float_format', '{:,.2f}'.format) # similar to options(scipen=999) in R
In [3]:
# 3.1.1: Fundamentals of data frames
In [64]:
# 3.1.1.1: Construct a data frame from a dictionary
x_dict = {'a': [1, 2], 'b': [3, 4], 'c': [5, 6]}
df = pd.DataFrame(x_dict) # DataFrame class constructor from a dictionary
print(df) # 0, 1 is the index of the data frame
In [75]:
# 3.1.1.2: Construct a data frame from a comma-separated value (csv) file
converters = {'a': int, 'b': int, 'c': int}
df = pd.read_csv('files/test_in.csv', converters=converters) # use a sep='\t' for a tab-separated value (tsv) file
df
Out[75]:
In [76]:
# 3.1.1.3: Construct a data frame from a fixed-width file
df = pd.read_fwf('files/test_in.txt', widths=[3, 3, 3], converters=converters)
df
Out[76]:
In [77]:
# 3.1.1.4: Construct a data frame from an Excel file
df = pd.read_excel('files/test_in.xlsx', 'Sheet1', converters=converters) # need to install the xlrd (Excel read) module first
df
Out[77]:
In [80]:
# 3.1.1.5: Fundamental data frame properties
print(df.shape) # dimensions of the data frame i.e. # rows by # columns
print(df.columns) # the index of column names. dtype of object means that the column names are strings.
print(df.dtypes) # the data type of each column
print(df.index) # the row index of the data frame (similar to primary key in SQL)
In [79]:
# 3.1.1.6: Fundamental data frame methods
print(df.describe()) # summary statistics and percentiles of every numerical column
print(df.head(1)) # df.head(n) only select the top most n rows
print(df.tail(1)) # df.tail(n) only select the bottom most n rows
In [71]:
# 3.1.1.7: Selecting columns, rows and cells of data frames
print(df['a']) # Select one column. Use the column index value i.e. column name
# [column_name] or .column_name (equivalent to $ in R). Depends on if column_name contains spaces.
print(type(df['a'])) # A selected column is a series data type
print(df[['a', 'c']]) # Select multiple columns (not necessarily contiguous)
print('Select the first row:')
print(df.loc[0,:]) # Select rows by the row index using loc
print(df.loc[[0, 1], :]) # Pass a list of row indices to select multiple rows (not necessarily contiguous) )
print('Select a single cell:')
print(df.loc[0, 'a']) # Use loc to select a single cell using indices. Numbering starts with 0.
print(df.iloc[0, 0]) # Use iloc to select single cell using position. Numbering starts with 0.
In [87]:
# 3.1.1.8: Adding and removing columns from a data frame
df['d'] = [7, 6] # Add a new column to an existing data frame by calling the index
print(df)
df = df.drop('a', axis=1) # Remove an existing column from a data frame. axis of 1 indicates columns (axis=0 is rows).
print(df)
In [96]:
# 3.1.1.9: Series operations https://pandas.pydata.org/pandas-docs/stable/generated/pandas.Series.html
x = df['d']
print(x)
print('Value Counts:')
print(x.value_counts()) # shows the distribution of values
print('List Values:')
print(x.tolist()) # converts the series to a list
print('Mapped Values:')
print(x.map(lambda x: x**2)) # efficiently applies the function to each element of the series
In [91]:
x
Out[91]:
In [ ]:
# 3.1.2: Data frame operations: droping duplicates, sorting, filtering and formulas
In [99]:
# 3.1.2.1: Using drop_duplicates to return unique combinations of column values
# Analogous to the "remove duplicates" button in Excel
x = {'a': [1, 1, 2, 3], 'b': [3, 3, 3, 3], 'c': [1, 2, 3, 4]}
df = pd.DataFrame(x)
print(df)
print(df.drop_duplicates()) # returns unique rows from the entire data frame
df_sub = df[['a', 'b']] # sub data frame also called column filtering
print(df_sub.drop_duplicates()) # returns the unique values from combinations of a and b
print(df['b'].drop_duplicates()) # returns the unique values of only column b
In [100]:
# 3.1.2.2: Sort a data frame by its columns
# Analogous to the single/multiple sort in Excel
df = df.sort_values(by=['a', 'c'], ascending=[False, True]) # Multiple columns and mixed ascending/descending
print(df) # extend to any number columns of a data frame, in order of priority
print(df.reset_index(drop=True)) # reset the row index
In [101]:
# 3.1.2.3: Filtering. There are two ways to row filter in Pandas.
boolean_series = df['a'] == 1
print('Using the boolean series method:')
print(boolean_series) # Returns a series of booleans
print(df[boolean_series]) # Only select rows where column a is 1, pass a boolean series as an index
print(df[df['a']==1]) # Or use nesting to show all at once
print('Using the query method:') # Use the query method if multiple column conditions are required
print(df.query('a==1 and c==1')) # Note that quotes are not needed for column names
In [102]:
# 3.1.2.4: Missing data and filters
df.loc[0, 'b'] = None
print(df)
boolean_series = df['b'].isnull()
df[boolean_series]
Out[102]:
In [105]:
# 3.1.2.5:
def add(x):
first = x['a']
second = x['c']
return first + second
df.apply(add, axis=1) # applies a function to each row. Function is passed a single row
Out[105]:
In [106]:
# 3.1.3: Exporting Data Frames to Excel and csv
In [107]:
# 3.1.3.1: Exporting to Excel and csv
# Need to include file type extensions in the name
df.to_excel('files/test_out.xlsx', index=False) # need to insure that openpyxl is installed
df.to_csv('files/test_out.csv', index=False)
# Open output files
In [ ]: