In [ ]:
##################################################################
# Script:
#    testHoliday.py
# Usage:
#    python testHoliday.py <input_file> <pass1_file> <output_file>
# Description:
#    Get predictions based on training data model
#    Pass 2: prediction based on holiday info
# Authors:
#    Jasmin Nakic, jnakic@salesforce.com
#    Samir Pilipovic, spilipovic@salesforce.com
##################################################################

import sys
import numpy as np
from sklearn import linear_model
from sklearn.externals import joblib

# Imports required for visualization (plotly)
import plotly.graph_objs as go
from plotly import __version__
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

In [ ]:
# Debug switch: when True, the functions below print the first rows
# of their intermediate arrays
debugFlag = False

# Column names used as features for holiday hours: the "isHoliday" column
# followed by "isHour0" through "isHour23"
hourHolidayCols = ["isHoliday"] + ["isHour%d" % hour for hour in range(24)]

In [ ]:
# Add columns to the existing array and populate with data
def addColumns(dest, src, colNames):
    """Append the named fields of src to dest as new columns.

    Args:
        dest: array to extend. A 2-D array is used as-is; a 1-D array is
            treated as a single column.
        src: structured array (or any mapping of name -> 1-D array) whose
            fields have one entry per row of dest.
        colNames: iterable of field names to copy, in the order the new
            columns should appear. May be empty.

    Returns:
        A new array holding the columns of dest followed by one column per
        name in colNames. Neither dest nor src is modified.
    """
    names = list(colNames)
    # Stack everything in a single call instead of growing the array one
    # column at a time. With no names this simply returns a copy of dest,
    # where the previous implementation failed on an uninitialized 1-D buffer.
    return np.column_stack([dest] + [src[name] for name in names])
#end addColumns

# Get prediction using saved linear regression model
def getPredictions(rawData,calcData,modelName):
    """Build the holiday feature matrix and predict with a saved model.

    The feature matrix X has one row per record of rawData and 27 columns:
        0      constant zero column the matrix is grown from
        1      rawData["isHoliday"]
        2..25  rawData["isHoliday"] * rawData["isHour0" .. "isHour23"]
        26     (1 - rawData["isHoliday"]) * calcData["predHourWeek"]

    Args:
        rawData: structured array providing the "isHoliday", "isHour0" ..
            "isHour23" and "cnt" fields.
        calcData: structured array providing "predHourWeek"; it is combined
            element-wise with rawData, so it is assumed to be row-aligned
            with rawData.
        modelName: file name passed to joblib.load.

    Returns:
        The result of model.predict(X). The model score against
        rawData["cnt"] is printed as a side effect.
    """
    # Start from a single zero column and add the raw holiday/hour columns
    X = np.reshape(np.zeros(rawData.shape[0]),(-1,1))
    X = addColumns(X,rawData,hourHolidayCols)

    # Replace each plain hour column (columns 2..25) with its interaction
    # with the holiday flag
    isHoliday = rawData["isHoliday"]
    for hour in range(24):
        X[:, hour+2] = isHoliday*rawData["isHour%d" % hour]

    # Last column: the hour-of-week prediction, kept only where isHoliday is 0
    Xnoholiday = np.reshape((1-isHoliday)*calcData["predHourWeek"],(-1,1))
    X = np.append(X,Xnoholiday,1)

    if debugFlag:
        print("X 0: ", X[0:5])

    Y = np.copy(rawData["cnt"])
    if debugFlag:
        print("Y 0: ", Y[0:5])

    # NOTE: joblib.load unpickles the file, which can execute arbitrary code;
    # only load model files that come from a trusted source.
    model = joblib.load(modelName)
    P = model.predict(X)
    print("SCORE values: ", model.score(X,Y))
    if debugFlag:
        print("P 0-5: ", P[0:5])

    return P
#end getPredictions

# Write predictions to the output file
def writeResult(output,rawData,calcData,p5):
    """Write the input columns and all predictions as a tab-separated file.

    Args:
        output: file name or file handle passed to np.savetxt.
        rawData: structured array providing "timeStamp", "dateFrac",
            "isHoliday", "isSunday" and "cnt".
        calcData: structured array providing "predSimple", "predTrig",
            "predHourDay" and "predHourWeek".
        p5: holiday-model predictions, one per row of rawData. They are
            stored in an int field, so if p5 holds floats the fractional
            part is dropped.

    The file starts with a header line of tab-separated column names (no
    comment prefix), followed by one line per record.
    """
    resultType = [
        ("timeStamp","|U19"),
        ("dateFrac",float),
        ("isHoliday",int),
        ("isSunday",int),
        ("cnt",int),
        ("predSimple",int),
        ("predTrig",int),
        ("predHourDay",int),
        ("predHourWeek",int),
        ("predHoliday",int)
    ]
    # Allocate directly with the record dtype. Converting an uninitialized
    # float buffer to this dtype would cast garbage values into every field
    # (including the string field) before they are overwritten below.
    result = np.zeros(rawData.shape[0], dtype=resultType)

    # Columns copied from the raw input
    for name in ("timeStamp","dateFrac","isHoliday","isSunday","cnt"):
        result[name] = rawData[name]
    # Predictions carried over from calcData
    for name in ("predSimple","predTrig","predHourDay","predHourWeek"):
        result[name] = calcData[name]
    # Prediction of this pass
    result["predHoliday"] = p5

    if debugFlag:
        print("R 0-5: ", result[0:5])
    # Header is built from the field names so it always matches the columns
    hdr = "\t".join(result.dtype.names)
    np.savetxt(output,result,fmt="%s",delimiter="\t",header=hdr,comments="")
#end writeResult

In [ ]:
# Start: file names used by this pass
inputFileName = "test_data.txt"
hourlyFileName = "test_hourly.txt"
outputFileName = "test_holiday.txt"

# All input columns: 10 leading columns of mixed type (two strings, then
# int and float values) followed by 200 integer columns
inputData = np.genfromtxt(
    inputFileName,
    names=True,
    delimiter='\t',
    dtype=("|U19","|U10",int,float,int,float,float,int,float,float) + (int,)*200
)

# Columns of the hourly file:
# timeStamp dateFrac isHoliday isSunday cnt predSimple predTrig predHourDay predHourWeek
hourlyData = np.genfromtxt(
    hourlyFileName,
    names=True,
    delimiter='\t',
    dtype=("|U19",float) + (int,)*7
)

# Predict with the saved holiday model, then write the combined result file
PH = getPredictions(inputData,hourlyData,"modelHoliday")
writeResult(outputFileName,inputData,hourlyData,PH)

In [ ]:
# Read back the file generated above, typing each column explicitly:
# a timestamp string, a float, then eight integer columns
results = np.genfromtxt(
    outputFileName,
    names=True,
    delimiter='\t',
    dtype=("|U19",float) + (int,)*8
)

In [ ]:
# Inspect the loaded results: array shape, number of columns, and rows 1 to 4
print("Shape: %s" % (results.shape,))
print("Columns: %d" % len(results.dtype.names))
print(results[1:5])

In [ ]:
# Chart the holiday predictions against the actual counts (using plotly)
print("Plotly version", __version__) # requires plotly version >= 1.9.0
init_notebook_mode(connected=True)

# Actual counts (pass marker=dict(color='blue') to set the bar color)
set1 = go.Bar(x=results["dateFrac"], y=results["cnt"], name='Actual')
# Predicted counts, semi-transparent (pass marker=dict(color='crimson') to set the bar color)
set2 = go.Bar(x=results["dateFrac"], y=results["predHoliday"], name='Prediction', opacity=0.6)

barData = [set1, set2]
barLayout = go.Layout(title="Prediction vs. Actual", barmode='group')

fig = go.Figure(layout=barLayout, data=barData)
iplot(fig)