Standardize Data With Pandas

Import The Pandas And Numpy Libraries


In [ ]:
import pandas as pd
import numpy as np

# Set Pandas display options so we can see more data
pd.set_option('display.width', 1000)

Merge Two Datasets


In [ ]:
# Define the base path of the data
# Change this to the full path of the data files
base_file_path = '/Users/robert.dempsey/Dropbox/Private/Art of Skill Hacking/Books/'\
                 'Python Business Intelligence Cookbook/Data/Stats19-Data1979-2004/'

accidents_data_file = 'Accidents7904.csv'
casualty_data_file = 'Casualty7904.csv'
vehicles_data_file = 'Vehicles7904.csv'
vehicle_types_file = 'vehicle_types.csv'

In [ ]:
# Load the acidents dataset into a pandas dataset using the 'Accident_Index' column as the index of the dataframe
af = base_file_path + accidents_data_file
accidents = pd.read_csv(af,
                        sep=',',
                        header=0,
                        index_col=0,
                        parse_dates=False,
                        tupleize_cols=False,
                        error_bad_lines=False,
                        warn_bad_lines=False,
                        skip_blank_lines=True,
                        nrows=1000
                        )
accidents.head()

In [ ]:
# Load the casualties dataset into a pandas dataset using the 'Acc_Index' column as the index of the dataframe
cf = base_file_path + casualty_data_file
casualties = pd.read_csv(cf,
                        sep=',',
                        header=0,
                        index_col=0,
                        parse_dates=False,
                        tupleize_cols=False,
                        error_bad_lines=False,
                        warn_bad_lines=False,
                        skip_blank_lines=True,
                        nrows=1000
                        )
casualties.head()

In [ ]:
# Merge the two datasets using a left join, like in SQL, based on the index values, which in this case is the accident_index
merged_data = pd.merge(accidents, casualties, how = 'left', left_index = True, right_index = True)
merged_data.head()

# Uncomment the following lines to create a CSV file of the merged data
# merged_data.to_csv('merged_dataset.csv', sep=',', encoding='utf-8')
# print("CSV creation complete")

Reformat data: uppercase a column


In [ ]:
# Create a new dataframe to use for the following examples
# All data created using the Faker library: https://github.com/joke2k/faker
lc = pd.DataFrame({'people' : ["cole o'brien", "lise heidenreich", "zilpha skiles", "damion wisozk"],
                   'age' : [24, 35, 46, 57],
                   'ssn': ['6439', '689 24 9939', '306-05-2792', '992245832'],
                   'birth_date': ['2/15/54', '05/07/1958', '19XX-10-23', '01/26/0056'],
                   'customer_loyalty_level' : ['not at all', 'moderate', 'moderate', 'highly loyal']})
lc

In [ ]:
# Create the function to uppercase a string
def uppercase_string(s):
    """
    Standardizes a string by making it all caps
    :param s: string to uppercase
    :return: s
    """
    try:
        s = s.upper()
    except:
        pass
    return s
lc

In [ ]:
# Apply the function to the dataframe
lc.customer_loyalty_level = lc.customer_loyalty_level.apply(uppercase_string)
lc

Titlecase anything


In [ ]:
# Before running this code, install the titlecase library: pip install titlecase
from titlecase import titlecase

def titlecase_anything(thing):
    """
    Uses the titlecase library to titlecase a string
    :param thing: the thing to titlecase
    :return: thing
    """
    try:
        thing = titlecase(thing)
    except:
        pass
    return thing
lc

In [ ]:
# Apply the function to the dataframe
lc.people = lc.people.apply(titlecase_anything)
lc

Update values in place


In [ ]:
# Add a new column with no data
lc['marketing_score'] = np.nan
lc

In [ ]:
# Fill in the missing data of the chosen column with the text 'Missing', in place using inplace=True
# If we didn't use the inplace argument our update wouldn't stay
lc.marketing_score.fillna(0, inplace=True)
lc

Standardize a social security number


In [ ]:
def right(s, amount):
    """
    Returns a specified number of characters from a string starting on the right side
    :param s: string to extract the characters from
    :param amount: the number of characters to extract from the string
    """
    return s[-amount:]

def standardize_ssn(ssn):
    """
    Standardizes the SSN by removing any spaces, "XXXX", and dashes
    :param ssn: ssn to standardize
    :return: formatted_ssn
    """
    try:
        ssn = ssn.replace("-","")
        ssn = "".join(ssn.split())
        if len(ssn) < 9 and ssn != 'Missing':
            ssn = "000000000" + ssn
            ssn = right(ssn, 9)
    except:
        pass

    return ssn

lc

In [ ]:
# Apply the function to the dataframe
lc.ssn = lc.ssn.apply(standardize_ssn)
lc

Standardize dates


In [ ]:
from time import strftime
from datetime import datetime

def standardize_date(the_date):
    """
    Standardizes a date
    :param the_date: the date to standardize
    :return formatted_date
    """

    # Convert what we have to a string, just in case
    the_date = str(the_date)
    
    # Handle missing dates, however pandas should have filled this in as missing
    if not the_date or the_date.lower() == "missing" or the_date == "nan":
        formatted_date = "MISSING"

    # Handle dates that end with 'XXXX', start with 'XX', or are less than 1900
    if the_date.lower().find('x') != -1:
        formatted_date = "Incomplete"

    # Handle dates that start with something like "0056"
    if the_date[0:2] == "00":
        formatted_date = the_date.replace("00", "19")

    # 03/03/15
    try:
        formatted_date = str(datetime.strptime(the_date, '%m/%d/%y').strftime('%m/%d/%y'))
    except:
        pass
    
    # 03/03/2015
    try:
        formatted_date = str(datetime.strptime(the_date, '%m/%d/%Y').strftime('%m/%d/%y'))
    except:
        pass

    # 0000-03-03
    try:
        if int(the_date[0:4]) < 1900:
            formatted_date = "Incomplete"
        else:
            formatted_date = str(datetime.strptime(the_date, '%Y-%m-%d').strftime('%m/%d/%y'))
    except:
        pass

    return formatted_date

lc

In [ ]:
# Apply the function to the dataframe
lc.birth_date = lc.birth_date.apply(standardize_date)
lc

Convert categories to numbers for a speed boost


In [ ]:
lc.dtypes

In [ ]:
# Convert the customer_loyalty_level categories to numerics for faster processing
lc.customer_loyalty_level = lc.customer_loyalty_level.astype('category')
lc.customer_loyalty_level

In [ ]:
# Look at the data types after the conversion of the customer_loyalty_level column
lc.dtypes

In [ ]:
# We can describe the newly converted column
lc.customer_loyalty_level.describe()

In [ ]: