Predict Performance Using Test Data


In [ ]:
#######################################################
# Script:
#    testPerf.py
# Usage:
#    python testPerf.py <input_file> <output_file>
#    (in this notebook the file names are assigned directly in the "Start" cell)
# Description:
#    Get the prediction based on training data model
#    Pass 1: prediction based on hours in a week
# Authors:
#    Jasmin Nakic, jnakic@salesforce.com
#    Samir Pilipovic, spilipovic@salesforce.com
#######################################################

import sys
import numpy as np
from sklearn import linear_model
from sklearn.externals import joblib

# Imports required for visualization (plotly)
import plotly.graph_objs as go
from plotly import __version__
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

In [ ]:
# Set to True to print intermediate arrays while the script runs
debugFlag = False

# Feature (column) names used by each of the four saved models
weekDays = ("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday")

# Linear trend only
simpleCols = ["dateFrac"]
# Trend plus sine/cosine terms for weekday and hour
trigCols = ["dateFrac"] + [unit + fn for unit in ("weekday", "hour") for fn in ("Sin", "Cos")]
# Trend plus one indicator per weekday and one per hour of the day
hourDayCols = (
    ["dateFrac"]
    + ["is" + dayName for dayName in weekDays]
    + ["isHour" + str(hr) for hr in range(24)]
)
# Trend plus one indicator per (day, hour) slot of the week: H_0_0 .. H_6_23
hourWeekCols = ["dateFrac"] + [
    "H_" + str(day) + "_" + str(hr) for day in range(7) for hr in range(24)
]

In [ ]:
# Add columns to the existing array and populate with data
def addColumns(dest, src, colNames):
    """Return a new 2-D array made of ``dest`` followed by named columns of ``src``.

    Parameters:
        dest: 2-D array with one row per record of ``src``.
        src: array indexable by field name (for example a NumPy structured
            array); ``src[name]`` must yield one value per row.
        colNames: iterable of field names to append, in the given order.

    Returns:
        A newly allocated array; neither ``dest`` nor ``src`` is modified.
        When ``colNames`` is empty, a copy of ``dest`` is returned.
    """
    # Turn every requested field into an (n, 1) column so that all pieces
    # can be joined along axis 1.
    newCols = [np.reshape(src[name], (-1, 1)) for name in colNames]
    if not newCols:
        # Nothing to add. The previous implementation reached np.append with
        # a 1-D placeholder here and failed with a dimension mismatch.
        return np.copy(dest)
    # One concatenate call instead of one np.append per column, which
    # recopied the growing array on every iteration.
    return np.concatenate([dest] + newCols, axis=1)
#end addColumns

# Get prediction using saved linear regression model
def getPredictions(data,colList,modelName):
    """Load a saved model and return its predictions for ``data``.

    The feature matrix is a leading all-zero column followed by the
    ``colList`` fields of ``data`` (assembled with ``addColumns``). The
    model's ``score`` against ``data["cnt"]`` is printed as a side effect.

    Parameters:
        data: array indexable by field name; must contain "cnt" and every
            name in ``colList``.
        colList: names of the feature columns, in model order.
        modelName: path of the model file passed to ``joblib.load``.

    Returns:
        The value of ``model.predict`` for the assembled feature matrix.

    NOTE: ``joblib.load`` unpickles the file, so only load trusted models.
    NOTE(review): ``joblib`` comes from the file-level ``sklearn.externals``
    import, which newer scikit-learn releases no longer ship - confirm the
    installed version.
    """
    # Zero-filled first column, then the selected feature columns
    rowCount = data.shape[0]
    features = addColumns(np.zeros((rowCount, 1)), data, colList)
    if debugFlag:
        print("X 0: ", features[0:5])

    # Observed counts, used only for scoring
    actual = np.copy(data["cnt"])
    if debugFlag:
        print("Y 0: ", actual[0:5])

    model = joblib.load(modelName)
    predicted = model.predict(features)
    print("SCORE values: ", model.score(features, actual))
    if debugFlag:
        print("P 0-5: ", predicted[0:5])

    return predicted
#end getPredictions

# Write predictions to the output file
def writeResult(output,data,p1,p2,p3,p4):
    """Write the input columns and the four predictions as a tab-separated file.

    Parameters:
        output: file name or open file handle accepted by ``np.savetxt``.
        data: array indexable by field name; must provide "timeStamp",
            "dateFrac", "isHoliday", "isSunday" and "cnt".
        p1, p2, p3, p4: prediction vectors (one value per row of ``data``)
            stored as predSimple, predTrig, predHourDay and predHourWeek.

    The prediction fields are integers, so floating-point predictions are
    truncated toward zero when they are stored.
    """
    # timeStamp is a text field ("U"), not bytes ("S"): with Python 3 the
    # "%s" format renders bytes as b'...', which corrupted the first column
    # of the output and broke reading it back as text.
    resultType = [
        ("timeStamp","|U19"),
        ("dateFrac",float),
        ("isHoliday",int),
        ("isSunday",int),
        ("cnt",int),
        ("predSimple",int),
        ("predTrig",int),
        ("predHourDay",int),
        ("predHourWeek",int)
    ]
    # Allocate the structured array directly instead of casting an
    # uninitialised float array into the structured dtype.
    result = np.zeros(data.shape[0], dtype=resultType)

    # Columns copied unchanged from the input
    for name in ("timeStamp", "dateFrac", "isHoliday", "isSunday", "cnt"):
        result[name] = data[name]

    # One column per model
    result["predSimple"]   = p1
    result["predTrig"]     = p2
    result["predHourDay"]  = p3
    result["predHourWeek"] = p4

    if debugFlag:
        print("R 0-5: ", result[0:5])
    # Header is derived from the field names so it cannot drift from the data
    hdr = "\t".join(result.dtype.names)
    np.savetxt(output,result,fmt="%s",delimiter="\t",header=hdr,comments="")
#end writeResult

In [ ]:
# Start
inputFileName = "test_data.txt"
outputFileName = "test_hourly.txt"

# Column types of the tab-separated input: ten leading columns of mixed
# string/int/float type, followed by 200 integer columns.
leadingTypes = ("|U19","|U10",int,float,int,float,float,int,float,float)
inputTypes = leadingTypes + (int,) * 200

inputData = np.genfromtxt(
    inputFileName,
    delimiter='\t',
    names=True,
    dtype=inputTypes
)

# One prediction vector per saved model
P1 = getPredictions(inputData,simpleCols,"modelSimple")
P2 = getPredictions(inputData,trigCols,"modelTrig")
P3 = getPredictions(inputData,hourDayCols,"modelHourDay")
P4 = getPredictions(inputData,hourWeekCols,"modelHourWeek")

# Store actual values and all predictions side by side
writeResult(outputFileName,inputData,P1,P2,P3,P4)

In [ ]:
# Read back the file written above with one explicit type per column:
# a text column, a float column, then seven integer columns
resultTypes = ("|U19", float) + (int,) * 7
results = np.genfromtxt(
    outputFileName,
    delimiter='\t',
    names=True,
    dtype=resultTypes
)

In [ ]:
# Quick sanity check of the loaded results: dimensions and a few rows
columnCount = len(results.dtype.names)
print("Shape:", results.shape)
print("Columns:", columnCount)
# Rows with index 1 through 4
print(results[1:5])

In [ ]:
# Generate chart with predictions based on test data (using plotly)
print("Plotly version", __version__) # requires plotly version >= 1.9.0
init_notebook_mode(connected=True)

# Both series share the same x axis values
xValues = results["dateFrac"]

# Observed counts
set1 = go.Bar(x=xValues, y=results["cnt"], name='Actual')
# Hour-of-week model output, semi-transparent so both bars stay readable
set2 = go.Bar(x=xValues, y=results["predHourWeek"], opacity=0.6, name='Prediction')

barLayout = go.Layout(barmode='group', title="Prediction vs. Actual")
barData = [set1, set2]

fig = go.Figure(data=barData, layout=barLayout)
iplot(fig)

In [ ]: