notebook.community

Edit and run



In [63]:

    
# 3.1: Data Frames - more at https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html



In [5]:

    
# Import Pandas and configure it not to be in scientific notation
import pandas as pd
pd.set_option('display.float_format', '{:,.2f}'.format) # similar to options(scipen=999) in R



In [3]:

    
# 3.1.1: Fundamentals of data frames



In [64]:

    
# 3.1.1.1: Construct a data frame from a dictionary
x_dict = {'a': [1, 2], 'b': [3, 4], 'c': [5, 6]}
df = pd.DataFrame(x_dict) # DataFrame class constructor from a dictionary
print(df) # 0, 1 is the index of the data frame



In [75]:

    
# 3.1.1.2: Construct a data frame from a comma-separated value (csv) file
converters = {'a': int, 'b': int, 'c': int}
df = pd.read_csv('files/test_in.csv', converters=converters) # use a sep='\t' for a tab-separated value (tsv) file
df



In [76]:

    
# 3.1.1.3: Construct a data frame from a fixed-width file
df = pd.read_fwf('files/test_in.txt', widths=[3, 3, 3], converters=converters) 
df



In [77]:

    
# 3.1.1.4: Construct a data frame from an Excel file 
df = pd.read_excel('files/test_in.xlsx', 'Sheet1', converters=converters) # need to install the xlrd (Excel read) module first
df



In [80]:

    
# 3.1.1.5: Fundamental data frame properties
print(df.shape) # dimensions of the data frame i.e. # rows by # columns
print(df.columns) # the index of column names.  dtype of object means that the column names are strings.
print(df.dtypes) # the data type of each column
print(df.index) # the row index of the data frame (similar to primary key in SQL)









    



(2, 3)
Index(['a', 'b', 'c'], dtype='object')
a    int64
b    int64
c    int64
dtype: object
RangeIndex(start=0, stop=2, step=1)



In [79]:

    
# 3.1.1.6: Fundamental data frame methods
print(df.describe()) # summary statistics and percentiles of every numerical column
print(df.head(1)) # df.head(n) only select the top most n rows
print(df.tail(1)) # df.tail(n) only select the bottom most n rows









    



         a    b    c
count 2.00 2.00 2.00
mean  1.50 3.50 5.50
std   0.71 0.71 0.71
min   1.00 3.00 5.00
25%   1.25 3.25 5.25
50%   1.50 3.50 5.50
75%   1.75 3.75 5.75
max   2.00 4.00 6.00
   a  b  c
0  1  3  5
   a  b  c
1  2  4  6



In [71]:

    
# 3.1.1.7: Selecting columns, rows and cells of data frames
print(df['a']) # Select one column.  Use the column index value i.e. column name
# [column_name] or .column_name (equivalent to $ in R).  Depends on if column_name contains spaces.
print(type(df['a'])) # A selected column is a series data type
print(df[['a', 'c']]) # Select multiple columns (not necessarily contiguous)
print('Select the first row:')
print(df.loc[0,:]) # Select rows by the row index using loc
print(df.loc[[0, 1], :]) # Pass a list of row indices to select multiple rows (not necessarily contiguous) )
print('Select a single cell:')
print(df.loc[0, 'a']) # Use loc to select a single cell using indices.  Numbering starts with 0.
print(df.iloc[0, 0]) # Use iloc to select single cell using position.  Numbering starts with 0.









    



0    1
1    1
2    2
3    3
Name: a, dtype: int64
<class 'pandas.core.series.Series'>
   a  c
0  1  1
1  1  2
2  2  3
3  3  4
Select the first row:
a    1
b    3
c    1
Name: 0, dtype: int64
   a  b  c
0  1  3  1
1  1  3  2
Select a single cell:
1
1



In [87]:

    
# 3.1.1.8: Adding and removing columns from a data frame
df['d'] = [7, 6] # Add a new column to an existing data frame by calling the index
print(df)
df = df.drop('a', axis=1) # Remove an existing column from a data frame. axis of 1 indicates columns (axis=0 is rows).
print(df)









    



   a    b  c  d
1  2 4.00  6  7
0  1  nan  5  6
     b  c  d
1 4.00  6  7
0  nan  5  6



In [96]:

    
# 3.1.1.9: Series operations https://pandas.pydata.org/pandas-docs/stable/generated/pandas.Series.html
x = df['d']
print(x)
print('Value Counts:')
print(x.value_counts()) # shows the distribution of values
print('List Values:')
print(x.tolist()) # converts the series to a list
print('Mapped Values:')
print(x.map(lambda x: x**2)) # efficiently applies the function to each element of the series









    



1    7
0    6
Name: d, dtype: int64
Value Counts:
7    1
6    1
Name: d, dtype: int64
List Values:
[7, 6]
Mapped Values:
1    49
0    36
Name: d, dtype: int64



In [91]:

    
x









    Out[91]:





1    7
0    6
Name: d, dtype: int64



In [ ]:

    
# 3.1.2: Data frame operations: droping duplicates, sorting, filtering and formulas



In [99]:

    
# 3.1.2.1: Using drop_duplicates to return unique combinations of column values
# Analogous to the "remove duplicates" button in Excel
x = {'a': [1, 1, 2, 3], 'b': [3, 3, 3, 3], 'c': [1, 2, 3, 4]}
df = pd.DataFrame(x)
print(df)
print(df.drop_duplicates()) # returns unique rows from the entire data frame
df_sub = df[['a', 'b']] # sub data frame also called column filtering
print(df_sub.drop_duplicates()) # returns the unique values from combinations of a and b
print(df['b'].drop_duplicates()) # returns the unique values of only column b









    



   a  b  c
0  1  3  1
1  1  3  2
2  2  3  3
3  3  3  4
   a  b  c
0  1  3  1
1  1  3  2
2  2  3  3
3  3  3  4
   a  b
0  1  3
2  2  3
3  3  3
0    3
Name: b, dtype: int64



In [100]:

    
# 3.1.2.2: Sort a data frame by its columns
# Analogous to the single/multiple sort in Excel
df = df.sort_values(by=['a', 'c'], ascending=[False, True]) # Multiple columns and mixed ascending/descending
print(df) # extend to any number columns of a data frame, in order of priority
print(df.reset_index(drop=True)) # reset the row index



In [101]:

    
# 3.1.2.3: Filtering.  There are two ways to row filter in Pandas.
boolean_series = df['a'] == 1
print('Using the boolean series method:')
print(boolean_series) # Returns a series of booleans
print(df[boolean_series]) # Only select rows where column a is 1, pass a boolean series as an index
print(df[df['a']==1]) # Or use nesting to show all at once
print('Using the query method:') # Use the query method if multiple column conditions are required
print(df.query('a==1 and c==1')) # Note that quotes are not needed for column names









    



Using the boolean series method:
3    False
2    False
0     True
1     True
Name: a, dtype: bool
   a  b  c
0  1  3  1
1  1  3  2
   a  b  c
0  1  3  1
1  1  3  2
Using the query method:
   a  b  c
0  1  3  1



In [102]:

    
# 3.1.2.4: Missing data and filters
df.loc[0, 'b'] = None
print(df)
boolean_series = df['b'].isnull()
df[boolean_series]









    



   a    b  c
3  3 3.00  4
2  2 3.00  3
0  1  nan  1
1  1 3.00  2






    Out[102]:







  
    
      
      a
      b
      c
    
  
  
    
      0
      1
      nan
      1



In [105]:

    
# 3.1.2.5: 
def add(x):
    first = x['a']
    second = x['c']
    return first + second
df.apply(add, axis=1) # applies a function to each row.  Function is passed a single row









    Out[105]:





3   7.00
2   5.00
0   2.00
1   3.00
dtype: float64



In [106]:

    
# 3.1.3: Exporting Data Frames to Excel and csv



In [107]:

    
# 3.1.3.1: Exporting to Excel and csv
# Need to include file type extensions in the name
df.to_excel('files/test_out.xlsx', index=False) # need to insure that openpyxl is installed
df.to_csv('files/test_out.csv', index=False)
# Open output files



In [ ]: