In [ ]:
##################################################################
# Script:
# testHoliday.py
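# Usage:
# python testHoliday.py <input_file> <pass1_file> <output_file>
# NOTE: this notebook version does not read command-line arguments;
# the three file names are hard-coded in the "Start" cell.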
# Description:
# Get predictions based on training data model
# Pass 2: prediction based on holiday info
# Authors:
# Jasmin Nakic, jnakic@salesforce.com
# Samir Pilipovic, spilipovic@salesforce.com
##################################################################
import sys
import numpy as np
from sklearn import linear_model
from sklearn.externals import joblib
# Imports required for visualization (plotly)
import plotly.graph_objs as go
from plotly import __version__
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
In [ ]:
# Script debugging flag: set to True to print intermediate arrays
debugFlag = False
# Feature list for holiday hours: the isHoliday column followed by the
# 24 hourly columns isHour0 .. isHour23, in that order
hourHolidayCols = ["isHoliday"] + ["isHour%d" % hour for hour in range(24)]
In [ ]:
# Add columns to the existing array and populate with data
def addColumns(dest, src, colNames):
    """Append the named fields of ``src`` to ``dest`` as extra columns.

    Args:
        dest: 2-D array with one row per record in ``src``.
        src: Object indexable by field name (e.g. a NumPy structured
            array) whose fields hold one value per row.
        colNames: Iterable of field names to copy, in output order.

    Returns:
        A new 2-D array holding the columns of ``dest`` followed by one
        column per name in ``colNames``. Neither ``dest`` nor ``src`` is
        modified. If ``colNames`` is empty, a copy of ``dest`` is returned.
    """
    colNames = list(colNames)
    if not colNames:
        # Nothing to add. Without this guard there is no 2-D block to
        # append along axis 1 and NumPy raises ValueError.
        return np.copy(dest)
    # Build the block of new columns in a single concatenate rather than
    # growing it with one np.append (and one reallocation) per column.
    newCols = np.concatenate(
        [np.reshape(src[name], (-1, 1)) for name in colNames],
        axis=1
    )
    return np.append(dest, newCols, axis=1)
#end addColumns
# Get prediction using saved linear regression model
def getPredictions(rawData,calcData,modelName):
    """Build the holiday feature matrix and score it with a saved model.

    Args:
        rawData: Structured array providing the "isHoliday", "isHour0" ..
            "isHour23" and "cnt" fields.
        calcData: Structured array providing the "predHourWeek" field, one
            row per row of rawData.
        modelName: Path passed to joblib.load to restore the model.

    Returns:
        The array returned by the model's predict() for the feature matrix.

    Side effects:
        Prints the value of model.score(X, Y); prints samples of X, Y and
        the predictions when debugFlag is set.
    """
    holidayFlag = rawData["isHoliday"]

    # Column 0 is an all-zero placeholder. With the hourHolidayCols list
    # defined in this file, addColumns then appends isHoliday (column 1)
    # and the 24 hourly columns (columns 2-25).
    X = addColumns(np.zeros((rawData.shape[0], 1)), rawData, hourHolidayCols)

    # Replace each hourly column with isHoliday * isHour<h>
    for hour in range(24):
        X[:, hour + 2] = holidayFlag * rawData["isHour%d" % hour]

    # Final column: (1 - isHoliday) * predHourWeek from the previous pass
    nonHolidayPred = (1 - holidayFlag) * calcData["predHourWeek"]
    X = np.append(X, np.reshape(nonHolidayPred, (-1, 1)), 1)
    if debugFlag:
        print("X 0: ", X[0:5])

    Y = np.copy(rawData["cnt"])
    if debugFlag:
        print("Y 0: ", Y[0:5])

    model = joblib.load(modelName)
    P = model.predict(X)
    print("SCORE values: ", model.score(X,Y))
    if debugFlag:
        print("P 0-5: ", P[0:5])
    return P
#end getPredictions
# Write predictions to the output file
def writeResult(output,rawData,calcData,p5):
    """Write actual counts and all predictions to a tab-separated file.

    Args:
        output: File name (or open file object) handed to np.savetxt.
        rawData: Structured array providing the "timeStamp", "dateFrac",
            "isHoliday", "isSunday" and "cnt" fields.
        calcData: Structured array providing the "predSimple", "predTrig",
            "predHourDay" and "predHourWeek" fields.
        p5: Holiday-model predictions, one value per row of rawData.

    Returns:
        None. The result table is written to ``output`` with a header row
        naming the ten columns.
    """
    resultType = [
        ("timeStamp","|U19"),
        ("dateFrac",float),
        ("isHoliday",int),
        ("isSunday",int),
        ("cnt",int),
        ("predSimple",int),
        ("predTrig",int),
        ("predHourDay",int),
        ("predHourWeek",int),
        ("predHoliday",int)
    ]
    # Allocate the structured array directly. Wrapping a float np.empty()
    # in np.array(..., dtype=...) would first cast uninitialised floats into
    # every field, which is wasted work and can trigger cast warnings.
    # Every field is assigned below, so no uninitialised value is written.
    result = np.empty(rawData.shape[0], dtype=resultType)

    for name in ("timeStamp", "dateFrac", "isHoliday", "isSunday", "cnt"):
        result[name] = rawData[name]
    for name in ("predSimple", "predTrig", "predHourDay", "predHourWeek"):
        result[name] = calcData[name]
    # The predHoliday field is int, so p5 is converted to int on assignment
    result["predHoliday"] = p5

    if debugFlag:
        print("R 0-5: ", result[0:5])

    # Derive the header from the dtype so it cannot drift from the columns
    hdr = "\t".join(result.dtype.names)
    np.savetxt(output,result,fmt="%s",delimiter="\t",header=hdr,comments="")
#end writeResult
In [ ]:
# Start: file names used by this pass
inputFileName = "test_data.txt"
hourlyFileName = "test_hourly.txt"
outputFileName = "test_holiday.txt"

# All input columns - data types are strings, float and int.
# The first ten columns have mixed types; the remaining 200 are all int.
inputData = np.genfromtxt(
    inputFileName,
    dtype=("|U19","|U10",int,float,int,float,float,int,float,float) + (int,) * 200,
    delimiter="\t",
    names=True
)

# Output of the previous pass, columns:
# timeStamp dateFrac isHoliday isSunday cnt predSimple predTrig predHourDay predHourWeek
hourlyData = np.genfromtxt(
    hourlyFileName,
    dtype=("|U19",float) + (int,) * 7,
    delimiter="\t",
    names=True
)

# Predict with the saved holiday model and write the combined result file
PH = getPredictions(inputData,hourlyData,"modelHoliday")
writeResult(outputFileName,inputData,hourlyData,PH)
In [ ]:
# Read back the file written above, typing the columns explicitly:
# one string, one float, then eight int columns
results = np.genfromtxt(
    outputFileName,
    names=True,
    delimiter="\t",
    dtype=("|U19",float) + (int,) * 8
)
In [ ]:
# Quick sanity check of the reloaded result table
columnNames = results.dtype.names
print("Shape:", results.shape)
print("Columns:", len(columnNames))
# Show rows 1-4 (the slice starts at index 1, so row 0 is not printed)
print(results[1:5])
In [ ]:
# Generate a grouped bar chart of predictions vs. actual counts (using plotly)
print("Plotly version", __version__) # requires plotly version >= 1.9.0
init_notebook_mode(connected=True)

# Both traces share the dateFrac axis; marker colors are left at the defaults
set1 = go.Bar(x=results["dateFrac"], y=results["cnt"], name='Actual')
set2 = go.Bar(
    x=results["dateFrac"],
    y=results["predHoliday"],
    name='Prediction',
    opacity=0.6
)

barData = [set1, set2]
barLayout = go.Layout(title="Prediction vs. Actual", barmode='group')
fig = go.Figure(data=barData, layout=barLayout)
iplot(fig)