In [ ]:
import pandas as pd
import numpy as np
# Set Pandas display options so we can see more data
pd.set_option('display.width', 1000)
In [ ]:
# Define the base path of the data
# Change this to the full path of the data files
base_file_path = '/Users/robert.dempsey/Dropbox/Private/Art of Skill Hacking/Books/'\
'Python Business Intelligence Cookbook/Data/Stats19-Data1979-2004/'
accidents_data_file = 'Accidents7904.csv'
casualty_data_file = 'Casualty7904.csv'
vehicles_data_file = 'Vehicles7904.csv'
vehicle_types_file = 'vehicle_types.csv'
In [ ]:
# Load the acidents dataset into a pandas dataset using the 'Accident_Index' column as the index of the dataframe
af = base_file_path + accidents_data_file
accidents = pd.read_csv(af,
sep=',',
header=0,
index_col=0,
parse_dates=False,
tupleize_cols=False,
error_bad_lines=False,
warn_bad_lines=False,
skip_blank_lines=True,
nrows=1000
)
accidents.head()
In [ ]:
# Load the casualties dataset into a pandas dataset using the 'Acc_Index' column as the index of the dataframe
cf = base_file_path + casualty_data_file
casualties = pd.read_csv(cf,
sep=',',
header=0,
index_col=0,
parse_dates=False,
tupleize_cols=False,
error_bad_lines=False,
warn_bad_lines=False,
skip_blank_lines=True,
nrows=1000
)
casualties.head()
In [ ]:
# Merge the two datasets using a left join, like in SQL, based on the index values, which in this case is the accident_index
merged_data = pd.merge(accidents, casualties, how = 'left', left_index = True, right_index = True)
merged_data.head()
# Uncomment the following lines to create a CSV file of the merged data
# merged_data.to_csv('merged_dataset.csv', sep=',', encoding='utf-8')
# print("CSV creation complete")
In [ ]:
# Create a new dataframe to use for the following examples
# All data created using the Faker library: https://github.com/joke2k/faker
lc = pd.DataFrame({'people' : ["cole o'brien", "lise heidenreich", "zilpha skiles", "damion wisozk"],
'age' : [24, 35, 46, 57],
'ssn': ['6439', '689 24 9939', '306-05-2792', '992245832'],
'birth_date': ['2/15/54', '05/07/1958', '19XX-10-23', '01/26/0056'],
'customer_loyalty_level' : ['not at all', 'moderate', 'moderate', 'highly loyal']})
lc
In [ ]:
# Create the function to uppercase a string
def uppercase_string(s):
"""
Standardizes a string by making it all caps
:param s: string to uppercase
:return: s
"""
try:
s = s.upper()
except:
pass
return s
lc
In [ ]:
# Apply the function to the dataframe
lc.customer_loyalty_level = lc.customer_loyalty_level.apply(uppercase_string)
lc
In [ ]:
# Before running this code, install the titlecase library: pip install titlecase
from titlecase import titlecase
def titlecase_anything(thing):
"""
Uses the titlecase library to titlecase a string
:param thing: the thing to titlecase
:return: thing
"""
try:
thing = titlecase(thing)
except:
pass
return thing
lc
In [ ]:
# Apply the function to the dataframe
lc.people = lc.people.apply(titlecase_anything)
lc
In [ ]:
# Add a new column with no data
lc['marketing_score'] = np.nan
lc
In [ ]:
# Fill in the missing data of the chosen column with the text 'Missing', in place using inplace=True
# If we didn't use the inplace argument our update wouldn't stay
lc.marketing_score.fillna(0, inplace=True)
lc
In [ ]:
def right(s, amount):
"""
Returns a specified number of characters from a string starting on the right side
:param s: string to extract the characters from
:param amount: the number of characters to extract from the string
"""
return s[-amount:]
def standardize_ssn(ssn):
"""
Standardizes the SSN by removing any spaces, "XXXX", and dashes
:param ssn: ssn to standardize
:return: formatted_ssn
"""
try:
ssn = ssn.replace("-","")
ssn = "".join(ssn.split())
if len(ssn) < 9 and ssn != 'Missing':
ssn = "000000000" + ssn
ssn = right(ssn, 9)
except:
pass
return ssn
lc
In [ ]:
# Apply the function to the dataframe
lc.ssn = lc.ssn.apply(standardize_ssn)
lc
In [ ]:
from time import strftime
from datetime import datetime
def standardize_date(the_date):
"""
Standardizes a date
:param the_date: the date to standardize
:return formatted_date
"""
# Convert what we have to a string, just in case
the_date = str(the_date)
# Handle missing dates, however pandas should have filled this in as missing
if not the_date or the_date.lower() == "missing" or the_date == "nan":
formatted_date = "MISSING"
# Handle dates that end with 'XXXX', start with 'XX', or are less than 1900
if the_date.lower().find('x') != -1:
formatted_date = "Incomplete"
# Handle dates that start with something like "0056"
if the_date[0:2] == "00":
formatted_date = the_date.replace("00", "19")
# 03/03/15
try:
formatted_date = str(datetime.strptime(the_date, '%m/%d/%y').strftime('%m/%d/%y'))
except:
pass
# 03/03/2015
try:
formatted_date = str(datetime.strptime(the_date, '%m/%d/%Y').strftime('%m/%d/%y'))
except:
pass
# 0000-03-03
try:
if int(the_date[0:4]) < 1900:
formatted_date = "Incomplete"
else:
formatted_date = str(datetime.strptime(the_date, '%Y-%m-%d').strftime('%m/%d/%y'))
except:
pass
return formatted_date
lc
In [ ]:
# Apply the function to the dataframe
lc.birth_date = lc.birth_date.apply(standardize_date)
lc
In [ ]:
lc.dtypes
In [ ]:
# Convert the customer_loyalty_level categories to numerics for faster processing
lc.customer_loyalty_level = lc.customer_loyalty_level.astype('category')
lc.customer_loyalty_level
In [ ]:
# Look at the data types after the conversion of the customer_loyalty_level column
lc.dtypes
In [ ]:
# We can describe the newly converted column
lc.customer_loyalty_level.describe()
In [ ]: