Generate Training Data with All Features


In [ ]:
#######################################################
# Script:
#    genFeatures.py
# Usage:
#    python genFeatures.py <input_file>
# Description:
#    Generate feature data set for performance metrics
# Authors:
#    Jasmin Nakic, jnakic@salesforce.com
#    Samir Pilipovic, spilipovic@salesforce.com
#######################################################
import sys
import math
import numpy as np
from datetime import datetime

# Imports required for visualization (plotly)
import plotly.graph_objs as go
from plotly import __version__
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

In [ ]:
# Calculate datetime in a fractional format
def getOrdinalFrac(dateObj):
    """Return the proleptic Gregorian ordinal of dateObj plus its time of day as a day fraction.

    Only the hour and the minute contribute to the fractional part; seconds
    are not read.
    """
    wholeDays = dateObj.toordinal()
    hourShare = dateObj.hour/24.0
    minuteShare = dateObj.minute / 1440.0
    # Same left-to-right addition order as before so the float result is bit-identical
    return wholeDays + hourShare + minuteShare
#end getOrdinalFrac

# Generate all features for a specific event
def process(r,data):
    """Write one tab-delimited feature row for a single input record.

    Parameters:
        r    -- writable text file object that receives the feature row
        data -- one input line, tab delimited: col #1 is the timestamp in
                "yy-mm-dd HH:MM" format and col #2 is the integer metric value

    Raises:
        IndexError if the line has fewer than two tab-separated columns.
        ValueError if the metric value or the timestamp cannot be parsed.
    """
    # Tab delimited file where col #1 is timestamp and col #2 is the metric value
    vals = data.split('\t')
    cnt = int(vals[1])
    dt = datetime.strptime(vals[0],"%y-%m-%d %H:%M")

    # Render the timestamp from the parsed value instead of gluing "20" and ":00"
    # around the raw text: the result is always zero-padded (19 characters) and
    # always agrees with dateStr, whatever century strptime resolved %y to.
    timeStamp = dt.strftime("%Y-%m-%d %H:%M:%S")
    dateStr = dt.strftime("%Y-%m-%d")
    ordinalFrac = getOrdinalFrac(dt)

    # Cyclic encoding of the day of week (0 = Monday) and the hour of day
    dayInWeek = dt.weekday()
    weekdaySin = math.sin(dayInWeek*2*math.pi/7)
    weekdayCos = math.cos(dayInWeek*2*math.pi/7)

    hourInDay = dt.hour
    hourSin = math.sin(hourInDay*2*math.pi/24)
    hourCos = math.cos(hourInDay*2*math.pi/24)

    # One-hot flags: isMonday..isSunday and isHour0..isHour23
    dayFlags = ["1" if dayInWeek == d else "0" for d in range(7)]
    hourFlags = ["1" if hourInDay == h else "0" for h in range(24)]

    # Generate input for each hour in a week (7*24 flags, day-major order)
    hourWeekFlags = ["1" if (d == dayInWeek and h == hourInDay) else "0"
                     for d in range(7) for h in range(24)]

    # Holidays in 2016: May 16, Jul 14 and Aug 15 (matched on month and day only)
    holidays = ((5, 16), (7, 14), (8, 15))
    isHoliday = 1 if (dt.month, dt.day) in holidays else 0

    # Print the data line
    # Total number of values = 42 + 7*24 = 210:
    #   10 base columns, 7 day flags, 24 hour flags, 1 holiday flag, 168 hour-of-week flags
    fields = [timeStamp, dateStr, str(cnt), str(ordinalFrac), str(dayInWeek),
              "%.8f" % weekdaySin, "%.8f" % weekdayCos, str(hourInDay),
              "%.8f" % hourSin, "%.8f" % hourCos]
    fields.extend(dayFlags)
    fields.extend(hourFlags)
    fields.append(str(isHoliday))
    fields.extend(hourWeekFlags)
    print("\t".join(fields), file=r)
#end process


# Header contains titles for the prediction input data columns
def header(r):
    """Write the tab-delimited title row (210 column names) to file object r."""
    baseTitles = ["timeStamp", "dateStr", "cnt", "dateFrac", "dayInWeek",
                  "weekdaySin", "weekdayCos", "hourInDay", "hourSin", "hourCos"]
    dayTitles = ["isMonday", "isTuesday", "isWednesday", "isThursday",
                 "isFriday", "isSaturday", "isSunday"]
    hourTitles = ["isHour" + str(hour) for hour in range(24)]
    # One title per (day, hour) pair of the week, day-major: H_0_0 ... H_6_23
    weekHourTitles = ["H_" + str(day) + "_" + str(hour)
                      for day in range(7) for hour in range(24)]

    # Print the header line
    allTitles = baseTitles + dayTitles + hourTitles + ["isHoliday"] + weekHourTitles
    print("\t".join(allTitles), file=r)
#end header

# Generate feature set for the actual input file
def genFeatureSet(inputFileName,resultFileName):
    """Convert a raw tab-delimited input file into a feature data file.

    Parameters:
        inputFileName  -- path of the input file; its first line is a header
        resultFileName -- path of the feature file to create (overwritten)

    The first input line is replaced by the feature header row; every
    following non-blank line is converted with process(). Blank lines (for
    example an empty line at the end of the file) are skipped, because
    process() cannot parse a line without a timestamp and a value.
    """
    # Simply read file line by line, skip the header line
    with open(inputFileName) as f, open(resultFileName,"w") as r:
        for linecnt, line in enumerate(f, start=1):
            line = line.strip()
            if linecnt == 1:
                header(r)
            elif line:
                process(r,line)
#end genFeatureSet

Generate Training and Test feature data files


In [ ]:
# Start: input and result file names for the training and the test set
trainInputFile, trainResultFile = "train_input.txt", "train_data.txt"
testInputFile, testResultFile = "test_input.txt", "test_data.txt"

# Build the feature file for each set, training first
for inputFile, resultFile in ((trainInputFile, trainResultFile),
                              (testInputFile, testResultFile)):
    genFeatureSet(inputFile, resultFile)

Load Training Data


In [ ]:
# Load the training data from file generated above using correct data types.
# The first 10 columns mix strings, ints and floats; the remaining 200 columns
# (7 day flags, 24 hour flags, the holiday flag and 7*24 hour-of-week flags)
# are all ints, which gives 210 column types in total.
leadingTypes = ("|U19", "|U10", int, float, int, float, float, int, float, float)
flagTypes = (int,) * 200
dataSet = np.genfromtxt(
    trainResultFile,
    delimiter='\t',
    names=True,
    dtype=leadingTypes + flagTypes
)

Examine Training Data


In [ ]:
# Get number of rows, columns, column names and sample rows
columnNames = dataSet.dtype.names
for label, value in (("Shape:", dataSet.shape), ("Columns:", len(columnNames))):
    print(label, value)
print(columnNames)
# Rows 1 through 4 of the structured array
print(dataSet[1:5])

Data Visualization

We are using the "plotly" module for Python. It can generate charts both online, via a cloud service, and offline.


In [ ]:
# Generate training data chart using plotly
print("Plotly version", __version__) # requires plotly version >= 1.9.0
init_notebook_mode(connected=True)

# Prepare chart data: one bar per record, metric value against fractional date
set1 = go.Bar(name="Cnt", x=dataSet["dateFrac"], y=dataSet["cnt"])
barData = [set1]

# Use offline mode to display charts
barLayout = go.Layout(title="Training Data", barmode='group')
fig = go.Figure(layout=barLayout, data=barData)
iplot(fig)

In [ ]: