Preprocessing is the set of transformations applied to your data before training your model. This includes converting your data to its basic numerical components.
Video Tutorial:
In [1]:
# Core Libraries
import os
from fnmatch import fnmatch
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from IPython.html import widgets
from IPython.html.widgets import interact
from IPython.display import display
import urllib2
from datetime import datetime
# Feature Extraction
from sklearn.feature_extraction import DictVectorizer
# Preprocessing
from sklearn import preprocessing
# External
from sklearn.externals import joblib
# Hide Warnings
# NOTE: the 'ignore' action is installed without a category filter, so every
# warning raised after this point in the session is suppressed.
import warnings
warnings.filterwarnings('ignore')
# Configure Pandas
# Allow up to 100 columns and a 120-character line width when DataFrames are printed.
pd.options.display.max_columns = 100
pd.options.display.width = 120
In [4]:
# HTML shown in wgt_alert by load_data after a successful read;
# the single %s slot receives the path that was loaded.
success_alert = """
<div class="alert alert-success" role="alert">Loading data from %s was successful.</div>
"""
# HTML shown in wgt_alert by load_data after a failed read;
# the two %s slots receive the path and the caught exception.
error_alert = """
<div class="alert alert-danger" role="alert">Error loading data from %s. %s</div>
"""
def load_data(widget):
    """Read the file named in the Path/URL box into the global ``csv_data``.

    Click handler for the "Load Data" button. The separator comes from
    ``wgt_separator``; when the header checkbox is unchecked, the column names
    are taken from the comma-separated text in ``wgt_manual_header``.
    The outcome (success or error) is written to ``wgt_alert``, which is then
    made visible.

    Args:
        widget: the button that was clicked (unused).
    """
    global csv_data
    location = wgt_file_location.value
    try:
        read_options = {"sep": wgt_separator.value}
        if not wgt_header.value:
            # No header row in the file: use the names typed by the user.
            read_options["names"] = wgt_manual_header.value.split(",")
        csv_data = pd.read_csv(location, **read_options)
        wgt_alert.value = success_alert % location
    except Exception as ex:
        # Echo the failure in the cell output as well as in the alert box.
        print(ex)
        print(location)
        wgt_alert.value = error_alert % (location, ex)
    wgt_alert.visible = True
def preview_file(widget):
    """Show the beginning of the selected file or URL in the preview pane.

    Click handler for the "Preview File" button. Paths starting with
    ``http://``, ``https://`` or ``ftp://`` are opened with ``urllib2``;
    anything else is opened as a local file. Up to 1000 characters/bytes
    (whatever ``read(1000)`` returns) are placed inside a ``<pre>`` element
    in ``wgt_file_preview``.

    Args:
        widget: the button that was clicked (unused).

    Raises:
        Whatever ``urllib2.urlopen`` / ``open`` / ``read`` raise; errors are
        not caught here.
    """
    path = wgt_file_location.value
    if path.startswith(("http://", "https://", "ftp://")):
        raw_file = urllib2.urlopen(path)
    else:
        raw_file = open(path)
    try:
        wgt_file_preview.value = "<pre>%s</pre>" % raw_file.read(1000)
    finally:
        # Close the handle even when read() or the widget update fails.
        raw_file.close()
def manual_columns(name, old, new):
    """Show the manual column-names box only while the header checkbox is unchecked.

    Registered as the change handler for the ``value`` trait of ``wgt_header``.

    Args:
        name: name of the trait that changed (unused).
        old: previous checkbox state (unused).
        new: current checkbox state; True means the file has a header row.
    """
    # Derive visibility from the current state rather than from the previous one.
    wgt_manual_header.visible = not new
def update_path(name, old, new):
    """Copy the entry picked in the files dropdown into the Path/URL text box.

    Registered as the change handler for the ``value`` trait of
    ``wgt_file_path``; only ``new`` (the newly selected value) is used.
    """
    wgt_file_location.value = new
def load_files(widget):
    """Fill ``widget.values`` with the text/CSV files found under the current directory.

    Walks ``os.curdir`` recursively and collects every file whose name matches
    ``*.txt`` or ``*.csv``. The result is a dict mapping each joined path to
    itself, assigned to ``widget.values``.

    Args:
        widget: object whose ``values`` attribute receives the dict.
    """
    wanted = ("*.txt", "*.csv")
    found = {}
    for folder, _subdirs, filenames in os.walk(os.curdir):
        for filename in filenames:
            if any(fnmatch(filename, glob) for glob in wanted):
                full_path = os.path.join(folder, filename)
                found[full_path] = full_path
    widget.values = found
# --- Load-data form: create the widgets ---
container = widgets.ContainerWidget()
wgt_alert = widgets.HTMLWidget()
wgt_file_location = widgets.TextWidget(description="Path/URL:")
wgt_file_path = widgets.DropdownWidget(description="Files List")
wgt_separator = widgets.TextWidget(description="Separator", value=",")
wgt_header = widgets.CheckboxWidget(description="First columns is a header?", value=True)
# Created hidden; manual_columns changes its visibility when wgt_header changes.
wgt_manual_header = widgets.TextWidget(description="Columns seperated by commas", visible=False)
wgt_load_data = widgets.ButtonWidget(description="Load Data")
wgt_preview_file = widgets.ButtonWidget(description="Preview File")
wgt_file_preview = widgets.HTMLWidget()
# The alert stays hidden until load_data sets it visible.
wgt_alert.visible = False
# --- Wire the callbacks ---
wgt_load_data.on_click(load_data)
wgt_preview_file.on_click(preview_file)
# load_files is registered to run when the dropdown is displayed.
wgt_file_path.on_displayed(load_files)
# Selecting an entry in the dropdown copies it into the Path/URL box.
wgt_file_path.on_trait_change(update_path, "value")
wgt_header.on_trait_change(manual_columns, "value")
# --- Assemble and show the form ---
container.children = (wgt_alert, wgt_file_path, wgt_file_location, wgt_separator, wgt_header, wgt_manual_header,
wgt_load_data, wgt_preview_file, wgt_file_preview)
display(container)
In [5]:
# Alert templates for the type-conversion cell.
# NOTE: these rebind the same global names that load_data formats with a path;
# this success_alert has no %s slot and is assigned as-is by process_columns.
success_alert = """
<div class="alert alert-success" role="alert">Type conversion was successful.</div>
"""
# The single %s slot receives the exception caught by process_columns.
error_alert = """
<div class="alert alert-danger" role="alert">Error in type conversion. %s.</div>
"""
def get_stats(column):
    """Return an HTML summary (max, min, mean, median) for a numeric column.

    Args:
        column: name of a column in the global ``csv_data``.

    Returns:
        An HTML fragment with the four statistics when the column dtype is
        int32/int64/float32/float64, otherwise the string ``"Not numerical"``.
    """
    flat = pd.Series(csv_data[column].values.ravel())
    if str(flat.dtype) not in ("int32", "int64", "float32", "float64"):
        return "Not numerical"
    template = (
        "Max:<span class='badge'>%s</span>, Min:<span class='badge'>%s</span>,"
        "Avg:<span class='badge'>%s</span>, Median:<span class='badge'>%s</span>"
    )
    return template % (flat.max(), flat.min(), flat.mean(), flat.median())
def get_column_type(column):
    """Return the type-dropdown label that matches the dtype of a column.

    Args:
        column: name of a column in the global ``csv_data``.

    Returns:
        ``"Int"``, ``"Float"``, ``"Boolean"`` or ``"Object"``. Any dtype
        without a dedicated label (for example a datetime column) is reported
        as ``"Object"``, a choice for which ``process_column`` performs no
        conversion. The function therefore never returns ``None``, which the
        caller would otherwise pass as the dropdown's initial value.
    """
    labels = {
        "int32": "Int",
        "int64": "Int",
        "float32": "Float",
        "float64": "Float",
        "bool": "Boolean",
        "object": "Object",
    }
    dtype_name = str(pd.Series(csv_data[column].values.ravel()).dtype)
    return labels.get(dtype_name, "Object")
def process_column(column):
    """Apply the conversion chosen in one row of the type-conversion form.

    Args:
        column: row container whose ``children[1]`` holds the column name and
            ``children[2]`` the selected target type.

    "Float" and "Int" convert the column in place. "Ordinal Date" and
    "Text Date" add a new ``<name>_date`` column; text dates are parsed with
    the format currently entered in ``wgt_date_format``.
    """
    target = column.children[1].value
    chosen = column.children[2].value
    if chosen in ("Float", "Int"):
        numpy_type = np.float64 if chosen == "Float" else np.int64
        csv_data[target] = csv_data[target].astype(numpy_type)
    elif chosen == "Ordinal Date":
        csv_data[target + "_date"] = csv_data[target].apply(datetime.fromordinal)
    elif chosen == "Text Date":
        as_text = csv_data[target].astype(str)
        csv_data[target + "_date"] = as_text.apply(datetime.strptime,
                                                   args=(wgt_date_format.value,))
    # Any other selection ("Boolean", "Object") falls through without a conversion.
def process_columns(widget):
    """Run ``process_column`` on every row container and report the outcome.

    Click handler for the "Process Columns" button of the type-conversion
    form. Children of ``main_container`` that are not ``ContainerWidget``
    instances (alert, date-format box, button) are skipped. The first
    exception stops the loop and is shown in ``wgt_alert``.

    Args:
        widget: the button that was clicked (unused).
    """
    try:
        for row in main_container.children:
            if not isinstance(row, widgets.ContainerWidget):
                continue
            process_column(row)
        wgt_alert.value = success_alert
    except Exception as ex:
        wgt_alert.value = error_alert % ex
    wgt_alert.visible = True
# --- Type-conversion form: one row per DataFrame column ---
# The container is displayed before rows are added to it.
main_container = widgets.ContainerWidget()
display(main_container)
# NOTE(review): `columns` is not read anywhere in the visible code.
columns = []
wgt_alert = widgets.HTMLWidget(visible=False)
# Format string used by process_column for the "Text Date" choice.
wgt_date_format = widgets.TextWidget(description="Text Date Format:" ,value="%Y%m%d")
wgt_process = widgets.ButtonWidget(description="Process Columns")
wgt_process.on_click(process_columns)
main_container.children = (wgt_alert, wgt_date_format,)
for column in csv_data.columns:
    temp_container = widgets.ContainerWidget()
    main_container.children += (temp_container,)
    # Swap the 'vbox' class for 'hbox'/'start' (presumably lays the row out horizontally).
    temp_container.remove_class('vbox')
    temp_container.add_class('hbox')
    temp_container.add_class('start')
    # w1: checkbox (not read by process_column); w2: column name, read-only;
    # w3: target type, preselected from the current dtype; w4: summary statistics.
    w1 = widgets.CheckboxWidget(value=True)
    w2 = widgets.TextWidget(value=column, disabled=True)
    w3 = widgets.DropdownWidget(values=["Float", "Int", "Ordinal Date", "Text Date", "Boolean", "Object"],
                                value = get_column_type(column))
    w4 = widgets.HTMLWidget()
    w2.set_css("width","200px")
    stats = get_stats(column)
    w4.value = "<pre>%s</pre>" % stats
    # process_column relies on this order: children[1] is the name, children[2] the type.
    children = [w1, w2, w3, w4]
    temp_container.children = children
# The button is appended once, after all rows.
main_container.children += (wgt_process,)
ref: https://docs.python.org/2/library/datetime.html
Directive | Meaning | Example | Notes |
---|---|---|---|
%a | Weekday as locale’s abbreviated name. | Sun, Mon, ..., Sat (en_US); So, Mo, ..., Sa (de_DE) | (1) |
%A | Weekday as locale’s full name. | Sunday, Monday, ..., Saturday (en_US); Sonntag, Montag, ..., Samstag (de_DE) | (1) |
%w | Weekday as a decimal number, where 0 is Sunday and 6 is Saturday. | 0, 1, ..., 6 | |
%d | Day of the month as a zero-padded decimal number. | 01, 02, ..., 31 | |
%b | Month as locale’s abbreviated name. | Jan, Feb, ..., Dec (en_US); Jan, Feb, ..., Dez (de_DE) | (1) |
%B | Month as locale’s full name. | January, February, ..., December (en_US); Januar, Februar, ..., Dezember (de_DE) | (1) |
%m | Month as a zero-padded decimal number. | 01, 02, ..., 12 | |
%y | Year without century as a zero-padded decimal number. | 00, 01, ..., 99 | |
%Y | Year with century as a decimal number. | 1970, 1988, 2001, 2013 | |
%H | Hour (24-hour clock) as a zero-padded decimal number. | 00, 01, ..., 23 | |
%I | Hour (12-hour clock) as a zero-padded decimal number. | 01, 02, ..., 12 | |
%p | Locale’s equivalent of either AM or PM. | AM, PM (en_US); am, pm (de_DE) | (1), (2) |
%M | Minute as a zero-padded decimal number. | 00, 01, ..., 59 | |
%S | Second as a zero-padded decimal number. | 00, 01, ..., 59 | (3) |
%f | Microsecond as a decimal number, zero-padded on the left. | 000000, 000001, ..., 999999 | (4) |
%z | UTC offset in the form +HHMM or -HHMM (empty string if the object is naive). | (empty), +0000, -0400, +1030 | (5) |
%Z | Time zone name (empty string if the object is naive). | (empty), UTC, EST, CST | |
%j | Day of the year as a zero-padded decimal number. | 001, 002, ..., 366 | |
%U | Week number of the year (Sunday as the first day of the week) as a zero padded decimal number. All days in a new year preceding the first Sunday are considered to be in week 0. | 00, 01, ..., 53 | (6) |
%W | Week number of the year (Monday as the first day of the week) as a decimal number. All days in a new year preceding the first Monday are considered to be in week 0. | 00, 01, ..., 53 | (6) |
%c | Locale’s appropriate date and time representation. | Tue Aug 16 21:30:00 1988 (en_US); Di 16 Aug 21:30:00 1988 (de_DE) | (1) |
%x | Locale’s appropriate date representation. | 08/16/88 (None); 08/16/1988 (en_US); 16.08.1988 (de_DE) | (1) |
%X | Locale’s appropriate time representation. | 21:30:00 (en_US); 21:30:00 (de_DE) | (1) |
%% | A literal '%' character. | % | |
In [6]:
# Alert templates for the text-features cell.
# NOTE: these rebind the global names that handlers defined in earlier cells also read.
# success_alert is assigned as-is by process_text_columns.
success_alert = """
<div class="alert alert-success" role="alert">Text features extraction was successful.</div>
"""
# The single %s slot receives the exception caught by process_text_columns.
error_alert = """
<div class="alert alert-danger" role="alert">Error in features extraction. %s.</div>
"""
def get_map_dict(column):
    """Suggest an integer code for each distinct value of a column.

    Args:
        column: name of a column in the global ``csv_data``.

    Returns:
        A dict mapping each distinct value, in the order returned by
        ``unique()``, to 0, 1, 2, ...
    """
    distinct = csv_data[column].unique()
    return dict((label, code) for code, label in enumerate(distinct))
def process_text_column(column):
    """Apply the text-processing choice made in one row of the text form.

    Args:
        column: row container whose ``children[0]`` holds the column name,
            ``children[1]`` the selected process and ``children[2]`` the
            mapping dict as text.

    "Map" adds ``<name>_mapped`` by looking each value up in the dict typed in
    the row. "Binary Vectorize" runs a ``DictVectorizer`` (separator ``_is_``)
    over the column and adds one column per feature name it reports. Any other
    choice leaves ``csv_data`` untouched.
    """
    column_name = column.children[0].value
    text_process = column.children[1].value
    if text_process == "Map":
        # The dict text is parsed only when it is needed, so an unparseable
        # entry cannot abort rows that use another option.
        # NOTE(security): eval() executes whatever expression is typed in the
        # text box. Acceptable for a local notebook; consider
        # ast.literal_eval if only literal dicts must be supported.
        map_dict = eval(column.children[2].value)
        csv_data[column_name + "_mapped"] = csv_data[column_name].map(map_dict)
    elif text_process == "Binary Vectorize":
        records = [{column_name: item} for item in csv_data[column_name]]
        vec = DictVectorizer(separator="_is_")
        encoded = vec.fit_transform(records).toarray()
        # One new DataFrame column per vectorizer feature, named after the feature.
        for position, feature_name in enumerate(vec.get_feature_names()):
            csv_data[feature_name] = encoded[:, position]
def process_text_columns(widget):
    """Run ``process_text_column`` on every row container and report the outcome.

    Click handler for the "Process Columns" button of the text form.
    Non-container children of ``main_container`` are skipped; the first
    exception stops the loop and is shown in ``wgt_alert``.

    Args:
        widget: the button that was clicked (unused).
    """
    try:
        for row in main_container.children:
            if not isinstance(row, widgets.ContainerWidget):
                continue
            process_text_column(row)
        wgt_alert.value = success_alert
    except Exception as ex:
        wgt_alert.value = error_alert % ex
    wgt_alert.visible = True
# --- Text-features form: one row per object-dtype column ---
# NOTE: main_container, wgt_alert and wgt_process rebind the globals created by the previous form.
main_container = widgets.ContainerWidget()
display(main_container)
wgt_alert = widgets.HTMLWidget(visible=False)
wgt_process = widgets.ButtonWidget(description="Process Columns")
wgt_process.on_click(process_text_columns)
main_container.children = (wgt_alert,)
# NOTE(review): `columns` is not read anywhere in the visible code.
columns = []
for column in csv_data.columns:
    if str(pd.Series(csv_data[column].values).dtype) == "object":
        temp_container = widgets.ContainerWidget()
        main_container.children += (temp_container,)
        # Swap the 'vbox' class for 'hbox'/'start' (presumably lays the row out horizontally).
        temp_container.remove_class('vbox')
        temp_container.add_class('hbox')
        temp_container.add_class('start')
        # w1: column name, read-only; w2: processing choice; w3: editable mapping dict as text.
        w1 = widgets.TextWidget(value=column, disabled=True)
        w2 = widgets.DropdownWidget(values=["Map", "Binary Vectorize", "Don't Process"])
        w3 = widgets.TextWidget(description="Dict: {'m': 0, 'f': 1}")
        w1.set_css("width","200px")
        # Pre-fill the mapping with one integer code per distinct value.
        w3.value = str(get_map_dict(column))
        # process_text_column relies on this order of children.
        children = [w1, w2, w3]
        temp_container.children = children;
# The button is appended once, after all rows.
main_container.children += (wgt_process,)
In [8]:
def print_preview():
    """Print the first and the last five rows of ``csv_data``.

    The two samples are separated by three lines containing a single dot.
    """
    sections = (
        "Data Sample:",
        csv_data.head(5),
        ".\n" * 3,
        csv_data.tail(5),
    )
    for section in sections:
        print(section)
# Show the current head and tail of csv_data.
print_preview()
In [9]:
# Alert templates for the date-features cell.
# NOTE: these rebind the global names that handlers defined in earlier cells also read.
# success_alert is assigned as-is by process_date_columns.
success_alert = """
<div class="alert alert-success" role="alert">Date features extraction was successful.</div>
"""
# The single %s slot receives the exception caught by process_date_columns.
error_alert = """
<div class="alert alert-danger" role="alert">Error in date extraction. %s.</div>
"""
def process_date_column(column):
    """Add the date parts ticked in one row of the date form as new columns.

    Args:
        column: row container whose ``children[0]`` holds the column name and
            ``children[1]`` .. ``children[7]`` the checkboxes for year, month,
            day, day of week, hour, minute and second, in that order.

    For every ticked checkbox a column ``<name>_<part>`` is added, filled with
    the attribute of the same name read from each value of the source column.
    Microseconds are not extracted.
    """
    column_name = column.children[0].value
    # Position of the checkbox in the row -> attribute read from each value;
    # the attribute name doubles as the suffix of the new column.
    parts = (
        (1, "year"),
        (2, "month"),
        (3, "day"),
        (4, "dayofweek"),
        (5, "hour"),
        (6, "minute"),
        (7, "second"),
    )
    for position, part in parts:
        if column.children[position].value:
            # `part=part` freezes the attribute name for this iteration.
            csv_data[column_name + "_" + part] = csv_data[column_name].apply(
                lambda stamp, part=part: getattr(stamp, part))
def process_date_columns(widget):
    """Run ``process_date_column`` on every row container and report the outcome.

    Click handler for the "Process Columns" button of the date form.
    Non-container children of ``main_container`` are skipped; the first
    exception stops the loop and is shown in ``wgt_alert``.

    Args:
        widget: the button that was clicked (unused).
    """
    try:
        for row in main_container.children:
            if not isinstance(row, widgets.ContainerWidget):
                continue
            process_date_column(row)
        wgt_alert.value = success_alert
    except Exception as ex:
        wgt_alert.value = error_alert % ex
    wgt_alert.visible = True
# --- Date-features form: one row per datetime64[ns] column ---
# NOTE: main_container, wgt_alert and wgt_process rebind the globals created by the previous form.
main_container = widgets.ContainerWidget()
display(main_container)
wgt_alert = widgets.HTMLWidget(visible=False)
wgt_process = widgets.ButtonWidget(description="Process Columns")
wgt_process.on_click(process_date_columns)
main_container.children = (wgt_alert,)
# NOTE(review): `columns` is not read anywhere in the visible code.
columns = []
for column in csv_data.columns:
    if str(pd.Series(csv_data[column].values).dtype) == "datetime64[ns]":
        temp_container = widgets.ContainerWidget()
        main_container.children += (temp_container,)
        # Swap the 'vbox' class for 'hbox'/'start' (presumably lays the row out horizontally).
        temp_container.remove_class('vbox')
        temp_container.add_class('hbox')
        temp_container.add_class('start')
        # w1: column name, read-only; w2..w8: one checkbox per date part, all ticked by default.
        w1 = widgets.TextWidget(value=column, disabled=True)
        w2 = widgets.CheckboxWidget(description="Year", value=True)
        w3 = widgets.CheckboxWidget(description="Month", value=True)
        w4 = widgets.CheckboxWidget(description="Day", value=True)
        w5 = widgets.CheckboxWidget(description="DayOfWeek", value=True)
        w6 = widgets.CheckboxWidget(description="Hour", value=True)
        w7 = widgets.CheckboxWidget(description="Minute", value=True)
        w8 = widgets.CheckboxWidget(description="Second", value=True)
        # Microsecond checkbox is disabled, matching the disabled branch in process_date_column.
        #w9 = widgets.CheckboxWidget(description="MS", value=True)
        w1.set_css("width","200px")
        # process_date_column reads the checkboxes by position, so the order matters.
        children = [w1, w2, w3, w4, w5, w6, w7, w8]
        temp_container.children = children;
# The button is appended once, after all rows.
main_container.children += (wgt_process,)
In [10]:
# Show the current head and tail of csv_data.
print_preview()
In [11]:
# Alert templates for the numbers-processing cell.
# NOTE: these rebind the global names that handlers defined in earlier cells also read.
# success_alert is assigned as-is by process_number_columns.
success_alert = """
<div class="alert alert-success" role="alert">Numbers processing was successful.</div>
"""
# One %s slot, intended for the exception text.
error_alert = """
<div class="alert alert-danger" role="alert">Error in numbers processing. %s.</div>
"""
def process_number_column(column):
    """Scale one numeric column according to the choice made in its row.

    Args:
        column: row container whose ``children[0]`` holds the column name,
            ``children[1]`` the selected process, and ``children[2]`` /
            ``children[3]`` the target minimum / maximum used by "Scale".

    "Scale" adds ``<name>_scaled`` using ``MinMaxScaler`` with the requested
    range; "Standard Scaler" adds ``<name>_standardscaler`` using
    ``StandardScaler``. Any other choice leaves ``csv_data`` untouched.
    """
    column_name = column.children[0].value
    number_process = column.children[1].value
    if number_process == "Scale":
        scaler = preprocessing.MinMaxScaler(
            feature_range=(column.children[2].value, column.children[3].value))
        suffix = "_scaled"
    elif number_process == "Standard Scaler":
        scaler = preprocessing.StandardScaler()
        suffix = "_standardscaler"
    else:
        return
    # Both scalers get the same 2-D (n_rows, 1) float input; a 1-D Series is
    # not accepted by current scikit-learn releases. The (n_rows, 1) result is
    # flattened so that a plain 1-D column is stored.
    values = csv_data[[column_name]].astype(np.float64).values
    csv_data[column_name + suffix] = scaler.fit_transform(values).ravel()
def process_number_columns(widget):
    """Run ``process_number_column`` on every row container and report the outcome.

    Click handler for the "Process Columns" button of the numbers form.
    Non-container children of ``main_container`` are skipped. As in the other
    forms' handlers, the first exception stops the loop and is shown in
    ``wgt_alert`` instead of propagating.

    Args:
        widget: the button that was clicked (unused).
    """
    try:
        for row in main_container.children:
            if isinstance(row, widgets.ContainerWidget):
                process_number_column(row)
        wgt_alert.value = success_alert
    except Exception as ex:
        wgt_alert.value = error_alert % ex
    wgt_alert.visible = True
# --- Numbers form: one row per int/float column ---
# NOTE: main_container, wgt_alert and wgt_process rebind the globals created by the previous form.
main_container = widgets.ContainerWidget()
display(main_container)
wgt_alert = widgets.HTMLWidget(visible=False)
wgt_process = widgets.ButtonWidget(description="Process Columns")
wgt_process.on_click(process_number_columns)
main_container.children = (wgt_alert,)
# NOTE(review): `columns` is not read anywhere in the visible code.
columns = []
for column in csv_data.columns:
    data_type = str(pd.Series(csv_data[column].values).dtype)
    if data_type in ["float32", "float64", "int32", "int64"]:
        temp_container = widgets.ContainerWidget()
        main_container.children += (temp_container,)
        # Swap the 'vbox' class for 'hbox'/'start' (presumably lays the row out horizontally).
        temp_container.remove_class('vbox')
        temp_container.add_class('hbox')
        temp_container.add_class('start')
        # w1: column name, read-only; w2: processing choice (defaults to "Don't Process");
        # w3/w4: target range read by process_number_column for the "Scale" choice.
        w1 = widgets.TextWidget(value=column, disabled=True)
        w2 = widgets.DropdownWidget(values=["Scale", "Standard Scaler", "Don't Process"], value="Don't Process")
        w3 = widgets.FloatTextWidget(description="Scale Min:", value=0)
        w4 = widgets.FloatTextWidget(description="Scale Max:", value=1)
        w1.set_css("width","200px")
        w3.set_css("width","50px")
        w4.set_css("width","50px")
        # process_number_column reads the children by position, so the order matters.
        children = [w1, w2, w3, w4]
        temp_container.children = children;
# The button is appended once, after all rows.
main_container.children += (wgt_process,)
In [13]:
def display_feature(feature_name):
    """Scatter-plot one column against the row index and print its mean and std.

    Args:
        feature_name: name of a column in the global ``csv_data``.
    """
    values = csv_data[feature_name]
    plt.figure()
    plt.scatter(list(csv_data.index), values)
    plt.grid()
    plt.ylabel(feature_name)
    plt.show()
    print("Mean: %4f" % np.mean(values))
    print("Std : %4f" % np.std(values))
# Offer the columns reported by csv_data._get_numeric_data() in a selection list and
# redraw the plot through display_feature whenever the selection changes.
# NOTE(review): _get_numeric_data is an underscore-prefixed (non-public) DataFrame method.
wgt_column = widgets.SelectWidget(values=list(csv_data._get_numeric_data().columns))
interact(display_feature, feature_name=wgt_column);
In [14]:
# Write the processed table as CSV text. DataFrame.save() pickled the frame
# (and no longer exists in current pandas), so the ".csv" file was not a CSV.
# index=False keeps the row index out of the file so that reading it back
# with a plain pd.read_csv call does not gain an extra unnamed column.
csv_data.to_csv("processed_data.csv", index=False)
In [15]:
# Persist the DataFrame object itself to processed_data.pkl with joblib.
joblib.dump(csv_data, "processed_data.pkl")
Out[15]: