In [ ]:
import pandas as pd
In [ ]:
# Use the fake dataset file we created
raw_data_file = 'data/customer_data_mini.csv'
raw_data = pd.read_csv(raw_data_file,
sep=',',
header=0,
index_col=False,
parse_dates=True,
tupleize_cols=False,
error_bad_lines=False,
warn_bad_lines=False,
skip_blank_lines=True
)
raw_data.head()
In [ ]:
raw_data.columns
In [ ]:
raw_data.rename(columns={'birth date': 'date_of_birth',
'customer loyalty level': 'customer_loyalty_level',
'first name': 'first_name',
'last name': 'last_name',
'ssn': 'social_security_number',
'postcode': 'zipcode',
'job': 'position'}, inplace=True)
raw_data.columns
In [ ]:
# Replace all missing values a string - 'Missing'
raw_data.fillna('Missing', inplace=True)
raw_data
In [ ]:
# Replace all missing values with a 0
raw_data.fillna(0, inplace=True)
In [ ]:
# Common practice - replace all missing values with the mean of the dataframe
raw_data.fillna(raw_data.mean(), inplace=True)
In [ ]:
# Common practice - replace the missing values of a single colum with the mean of that column
# DO NOT RUN THIS if you ran the previous cell
raw_data['marketing_score'].fillna(raw_data.mean()['marketing_score'], inplace=True)
raw_data
In [ ]:
import string
exclude = set(string.punctuation)
def remove_punctuation(x):
"""
Helper function to remove punctuation from a string
x: any string
"""
try:
x = ''.join(ch for ch in x if ch not in exclude)
except:
pass
return x
raw_data.last_name = raw_data.last_name.apply(remove_punctuation)
raw_data
In [ ]:
def remove_whitespace(x):
"""
Helper function to remove any blank space from a string
x: any string
"""
try:
x = "".join(x.split())
except:
pass
return x
raw_data.last_name = raw_data.last_name.apply(remove_whitespace)
raw_data
In [ ]:
def remove_internal_abbreviations(s, thing_to_replace, replacement_string):
"""
Helper function to remove things such as 'F/K/A' from a string
s: the string to replace a value in
thing_to_replace: what you want to replace in the given string
replacement_string: the string to use as a replacement
"""
try:
s = s.replace(thing_to_replace, replacement_string)
except:
pass
return s
raw_data['last_name'] = raw_data.apply(lambda x: remove_internal_abbreviations(x['last_name'], "FKA", "-"), axis=1)
raw_data
In [ ]: