In [ ]:
#######################################################
# Script:
# genFeatures.py
# Usage:
# python genFeatures.py
# (no command-line argument is read; input/output file names are set in the "Start" section)
# Description:
# Generate feature data set for performance metrics
# Authors:
# Jasmin Nakic, jnakic@salesforce.com
# Samir Pilipovic, spilipovic@salesforce.com
#######################################################
import sys
import math
import numpy as np
from datetime import datetime
# Imports required for visualization (plotly)
import plotly.graph_objs as go
from plotly import __version__
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
In [ ]:
# Calculate datetime in a fractional format
def getOrdinalFrac(dateObj):
    """Return the date/time of ``dateObj`` as a fractional day ordinal.

    The integer part is the proleptic Gregorian ordinal of the date; the
    fractional part is the time of day expressed as a fraction of 24 hours.

    Args:
        dateObj: a ``datetime`` instance (``hour``, ``minute``, ``second`` and
            ``microsecond`` attributes are read).

    Returns:
        float: ordinal day number plus the elapsed fraction of the day.
    """
    # Seconds and microseconds are added after the hour/minute terms so that
    # minute-resolution inputs produce exactly the same float as before.
    dtFrac = (dateObj.toordinal()
              + dateObj.hour / 24.0
              + dateObj.minute / 1440.0
              + dateObj.second / 86400.0
              + dateObj.microsecond / 86400000000.0)
    return dtFrac
#end getOrdinalFrac
# Generate all features for a specific event
def process(r,data):
    """Write one tab-delimited feature row for a single input record.

    The row has 210 values: 10 time/count columns, 7 day-of-week indicators,
    24 hour-of-day indicators, 1 holiday indicator and 7*24 hour-of-week
    indicators, in the same order as the titles written by ``header``.

    Args:
        r: writable text file object that receives the feature row.
        data: one input line, tab delimited, where column 1 is the timestamp
            in "%y-%m-%d %H:%M" format and column 2 is the integer metric
            value.

    Raises:
        IndexError: if the line has fewer than two tab-separated columns.
        ValueError: if the timestamp or the metric value cannot be parsed.
    """
    # Tab delimited file where col #1 is timestamp and col #2 is the metric value
    vals = data.split('\t')
    cnt = int(vals[1])
    dt = datetime.strptime(vals[0],"%y-%m-%d %H:%M")
    # Format the full timestamp from the parsed value instead of gluing "20"
    # and ":00" onto the raw text: this keeps it consistent with dateStr and
    # always yields the fixed-width "YYYY-MM-DD HH:MM:SS" form.
    timeStamp = dt.strftime("%Y-%m-%d %H:%M:%S")
    dateStr = dt.strftime("%Y-%m-%d")
    ordinalFrac = getOrdinalFrac(dt)

    # Cyclic encodings of the weekday (0 = Monday) and of the hour of day
    dayInWeek = dt.weekday()
    weekdaySin = math.sin(dayInWeek*2*math.pi/7)
    weekdayCos = math.cos(dayInWeek*2*math.pi/7)
    hourInDay = dt.hour
    hourSin = math.sin(hourInDay*2*math.pi/24)
    hourCos = math.cos(hourInDay*2*math.pi/24)

    # Indicator columns: isMonday..isSunday and isHour0..isHour23
    dayFlags = [1 if d == dayInWeek else 0 for d in range(7)]
    hourFlags = [1 if h == hourInDay else 0 for h in range(24)]

    # Generate input for each hour in a week (day-major order: H_0_0 .. H_6_23)
    hourWeekFlags = [1 if (d == dayInWeek and h == hourInDay) else 0
                     for d in range(7) for h in range(24)]

    # Holidays in 2016: May 16, Jul 14 and Aug 15
    # NOTE(review): only month and day are compared, so the same dates are
    # flagged in any year - confirm this is intended for non-2016 data.
    isHoliday = 1 if (dt.month, dt.day) in ((5, 16), (7, 14), (8, 15)) else 0

    # Print the data line
    # Total number of values = 42 + 7*24 = 210
    fields = [timeStamp, dateStr, "%s" % cnt, "%s" % ordinalFrac, "%s" % dayInWeek,
              "%.8f" % weekdaySin, "%.8f" % weekdayCos, "%s" % hourInDay,
              "%.8f" % hourSin, "%.8f" % hourCos]
    fields.extend(str(flag) for flag in dayFlags + hourFlags + [isHoliday] + hourWeekFlags)
    print("\t".join(fields), file=r)
#end process
# Header contains titles for the prediction input data columns
def header(r):
    """Write the tab-delimited title line for all feature columns to ``r``.

    The line holds 42 fixed titles followed by one ``H_<day>_<hour>`` title
    for every hour of the week (7 days x 24 hours).

    Args:
        r: writable text file object that receives the title line.
    """
    leadTitles = ["timeStamp","dateStr","cnt","dateFrac","dayInWeek",
                  "weekdaySin","weekdayCos","hourInDay","hourSin","hourCos"]
    dayTitles = ["isMonday","isTuesday","isWednesday","isThursday",
                 "isFriday","isSaturday","isSunday"]
    hourTitles = ["isHour" + str(hour) for hour in range(24)]
    # Day-major order, matching the nested day/hour loops used for the data rows
    hourWeekTitles = ["H_%d_%d" % (day, hour) for day in range(7) for hour in range(24)]
    # Print the header line
    allTitles = leadTitles + dayTitles + hourTitles + ["isHoliday"] + hourWeekTitles
    print("\t".join(allTitles), file=r)
#end header
# Generate feature set for the actual input file
def genFeatureSet(inputFileName,resultFileName):
    """Convert a raw input file into a feature file.

    The first input line is treated as a header: it is not parsed, and the
    feature title line is written in its place. Every following non-blank
    line is converted into one feature row.

    Args:
        inputFileName: path of the tab-delimited input file to read.
        resultFileName: path of the feature file to create (overwritten).

    Raises:
        OSError: if either file cannot be opened.
        IndexError, ValueError: propagated from ``process`` for malformed
            (non-blank) data lines.
    """
    # Simply read file line by line, skip the header line
    with open(inputFileName) as f, open(resultFileName,"w") as r:
        for linecnt, line in enumerate(f, 1):
            line = line.strip()
            if linecnt == 1:
                header(r)
            elif line:
                # Blank lines (e.g. a trailing empty line) hold no record;
                # passing them to process would fail on the missing column.
                process(r,line)
#end genFeatureSet
In [ ]:
# Start
# Input file / generated feature file pairs for the training and test sets
trainInputFile, trainResultFile = "train_input.txt", "train_data.txt"
testInputFile, testResultFile = "test_input.txt", "test_data.txt"
# Generate both feature files
genFeatureSet(inputFileName=trainInputFile, resultFileName=trainResultFile)
genFeatureSet(inputFileName=testInputFile, resultFileName=testResultFile)
In [ ]:
# Load the training data from file generated above using correct data types
# The first 10 columns mix strings, ints and floats; the remaining 200
# columns are all integers.
dataSet = np.genfromtxt(
    trainResultFile,
    delimiter='\t',
    names=True,
    dtype=("|U19","|U10",int,float,int,float,float,int,float,float) + (int,) * 200
)
In [ ]:
# Get number of rows, columns, column names and sample rows
columnNames = dataSet.dtype.names
print("Shape:", dataSet.shape)
print("Columns:", len(columnNames))
print(columnNames)
# Sample: rows at index 1 through 4
print(dataSet[1:5])
We are using the "plotly" module for Python. It can generate charts both online (via a cloud service) and offline.
In [ ]:
# Generate training data chart using plotly
print("Plotly version", __version__) # requires plotly version >= 1.9.0
# Set up plotly for use inside the notebook
# NOTE(review): connected=True presumably loads plotly.js from the network
# instead of embedding it in the notebook - confirm against the plotly docs.
init_notebook_mode(connected=True)
# Prepare chart data: one bar per training row, with the "dateFrac" column
# on the x axis and the "cnt" column on the y axis
set1 = go.Bar(
    x=dataSet["dateFrac"],
    y=dataSet["cnt"],
    name="Cnt")
barData = [set1]
# Chart layout and figure
barLayout = go.Layout(barmode='group', title="Training Data")
fig = go.Figure(data=barData, layout=barLayout)
# Use offline mode to display charts
iplot(fig)
In [ ]: