In [63]:
# 3.1: Data Frames - more at https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html

In [5]:
# Import Pandas and configure it not to be in scientific notation
import pandas as pd
pd.set_option('display.float_format', '{:,.2f}'.format) # similar to options(scipen=999) in R

In [3]:
# 3.1.1: Fundamentals of data frames

In [64]:
# 3.1.1.1: Construct a data frame from a dictionary
x_dict = {'a': [1, 2], 'b': [3, 4], 'c': [5, 6]}
df = pd.DataFrame(x_dict) # DataFrame class constructor from a dictionary
print(df) # 0, 1 is the index of the data frame


   a  b  c
0  1  3  5
1  2  4  6

In [75]:
# 3.1.1.2: Construct a data frame from a comma-separated value (csv) file
converters = {'a': int, 'b': int, 'c': int}
df = pd.read_csv('files/test_in.csv', converters=converters) # use a sep='\t' for a tab-separated value (tsv) file
df


Out[75]:
a b c
0 1 3 5
1 2 4 6

In [76]:
# 3.1.1.3: Construct a data frame from a fixed-width file
df = pd.read_fwf('files/test_in.txt', widths=[3, 3, 3], converters=converters) 
df


Out[76]:
a b c
0 1 3 5
1 2 4 6

In [77]:
# 3.1.1.4: Construct a data frame from an Excel file 
df = pd.read_excel('files/test_in.xlsx', 'Sheet1', converters=converters) # need to install the xlrd (Excel read) module first
df


Out[77]:
a b c
0 1 3 5
1 2 4 6

In [80]:
# 3.1.1.5: Fundamental data frame properties
print(df.shape) # dimensions of the data frame i.e. # rows by # columns
print(df.columns) # the index of column names.  dtype of object means that the column names are strings.
print(df.dtypes) # the data type of each column
print(df.index) # the row index of the data frame (similar to primary key in SQL)


(2, 3)
Index(['a', 'b', 'c'], dtype='object')
a    int64
b    int64
c    int64
dtype: object
RangeIndex(start=0, stop=2, step=1)

In [79]:
# 3.1.1.6: Fundamental data frame methods
print(df.describe()) # summary statistics and percentiles of every numerical column
print(df.head(1)) # df.head(n) only select the top most n rows
print(df.tail(1)) # df.tail(n) only select the bottom most n rows


         a    b    c
count 2.00 2.00 2.00
mean  1.50 3.50 5.50
std   0.71 0.71 0.71
min   1.00 3.00 5.00
25%   1.25 3.25 5.25
50%   1.50 3.50 5.50
75%   1.75 3.75 5.75
max   2.00 4.00 6.00
   a  b  c
0  1  3  5
   a  b  c
1  2  4  6

In [71]:
# 3.1.1.7: Selecting columns, rows and cells of data frames
print(df['a']) # Select one column.  Use the column index value i.e. column name
# [column_name] or .column_name (equivalent to $ in R).  Depends on if column_name contains spaces.
print(type(df['a'])) # A selected column is a series data type
print(df[['a', 'c']]) # Select multiple columns (not necessarily contiguous)
print('Select the first row:')
print(df.loc[0,:]) # Select rows by the row index using loc
print(df.loc[[0, 1], :]) # Pass a list of row indices to select multiple rows (not necessarily contiguous) )
print('Select a single cell:')
print(df.loc[0, 'a']) # Use loc to select a single cell using indices.  Numbering starts with 0.
print(df.iloc[0, 0]) # Use iloc to select single cell using position.  Numbering starts with 0.


0    1
1    1
2    2
3    3
Name: a, dtype: int64
<class 'pandas.core.series.Series'>
   a  c
0  1  1
1  1  2
2  2  3
3  3  4
Select the first row:
a    1
b    3
c    1
Name: 0, dtype: int64
   a  b  c
0  1  3  1
1  1  3  2
Select a single cell:
1
1

In [87]:
# 3.1.1.8: Adding and removing columns from a data frame
df['d'] = [7, 6] # Add a new column to an existing data frame by calling the index
print(df)
df = df.drop('a', axis=1) # Remove an existing column from a data frame. axis of 1 indicates columns (axis=0 is rows).
print(df)


   a    b  c  d
1  2 4.00  6  7
0  1  nan  5  6
     b  c  d
1 4.00  6  7
0  nan  5  6

In [96]:
# 3.1.1.9: Series operations https://pandas.pydata.org/pandas-docs/stable/generated/pandas.Series.html
x = df['d']
print(x)
print('Value Counts:')
print(x.value_counts()) # shows the distribution of values
print('List Values:')
print(x.tolist()) # converts the series to a list
print('Mapped Values:')
print(x.map(lambda x: x**2)) # efficiently applies the function to each element of the series


1    7
0    6
Name: d, dtype: int64
Value Counts:
7    1
6    1
Name: d, dtype: int64
List Values:
[7, 6]
Mapped Values:
1    49
0    36
Name: d, dtype: int64

In [91]:
x


Out[91]:
1    7
0    6
Name: d, dtype: int64

In [ ]:
# 3.1.2: Data frame operations: droping duplicates, sorting, filtering and formulas

In [99]:
# 3.1.2.1: Using drop_duplicates to return unique combinations of column values
# Analogous to the "remove duplicates" button in Excel
x = {'a': [1, 1, 2, 3], 'b': [3, 3, 3, 3], 'c': [1, 2, 3, 4]}
df = pd.DataFrame(x)
print(df)
print(df.drop_duplicates()) # returns unique rows from the entire data frame
df_sub = df[['a', 'b']] # sub data frame also called column filtering
print(df_sub.drop_duplicates()) # returns the unique values from combinations of a and b
print(df['b'].drop_duplicates()) # returns the unique values of only column b


   a  b  c
0  1  3  1
1  1  3  2
2  2  3  3
3  3  3  4
   a  b  c
0  1  3  1
1  1  3  2
2  2  3  3
3  3  3  4
   a  b
0  1  3
2  2  3
3  3  3
0    3
Name: b, dtype: int64

In [100]:
# 3.1.2.2: Sort a data frame by its columns
# Analogous to the single/multiple sort in Excel
df = df.sort_values(by=['a', 'c'], ascending=[False, True]) # Multiple columns and mixed ascending/descending
print(df) # extend to any number columns of a data frame, in order of priority
print(df.reset_index(drop=True)) # reset the row index


   a  b  c
3  3  3  4
2  2  3  3
0  1  3  1
1  1  3  2
   a  b  c
0  3  3  4
1  2  3  3
2  1  3  1
3  1  3  2

In [101]:
# 3.1.2.3: Filtering.  There are two ways to row filter in Pandas.
boolean_series = df['a'] == 1
print('Using the boolean series method:')
print(boolean_series) # Returns a series of booleans
print(df[boolean_series]) # Only select rows where column a is 1, pass a boolean series as an index
print(df[df['a']==1]) # Or use nesting to show all at once
print('Using the query method:') # Use the query method if multiple column conditions are required
print(df.query('a==1 and c==1')) # Note that quotes are not needed for column names


Using the boolean series method:
3    False
2    False
0     True
1     True
Name: a, dtype: bool
   a  b  c
0  1  3  1
1  1  3  2
   a  b  c
0  1  3  1
1  1  3  2
Using the query method:
   a  b  c
0  1  3  1

In [102]:
# 3.1.2.4: Missing data and filters
df.loc[0, 'b'] = None
print(df)
boolean_series = df['b'].isnull()
df[boolean_series]


   a    b  c
3  3 3.00  4
2  2 3.00  3
0  1  nan  1
1  1 3.00  2
Out[102]:
a b c
0 1 nan 1

In [105]:
# 3.1.2.5: 
def add(x):
    first = x['a']
    second = x['c']
    return first + second
df.apply(add, axis=1) # applies a function to each row.  Function is passed a single row


Out[105]:
3   7.00
2   5.00
0   2.00
1   3.00
dtype: float64

In [106]:
# 3.1.3: Exporting Data Frames to Excel and csv

In [107]:
# 3.1.3.1: Exporting to Excel and csv
# Need to include file type extensions in the name
df.to_excel('files/test_out.xlsx', index=False) # need to insure that openpyxl is installed
df.to_csv('files/test_out.csv', index=False)
# Open output files

In [ ]: