In [5]:
from pandas.tseries.offsets import *
import pandas as pd
import numpy as np
import csv
import matplotlib.pyplot as plt
import itertools
%matplotlib inline
In [6]:
path = {}
path['train'] = {'order': './training_data_2/order_data/order_data_{}',
                 'weather': './training_data_2/weather_data/weather_data_{}',
                 'traffic': './training_data_2/traffic_data/traffic_data_{}',
                 'district': './training_data_2/cluster_map/cluster_map',
                 'poi': './training_data_2/poi_data/poi_data'}
path['test'] = {'order': './test_data_2/order_data/order_data_{}_test',
                'weather': './test_data_2/weather_data/weather_data_{}_test',
                'traffic': './test_data_2/traffic_data/traffic_data_{}_test',
                'district': './test_data_2/cluster_map/cluster_map'}
M = np.timedelta64(1, 'm') # one-minute timedelta, used to convert timestamps into slot ids
D_range = range(1,67) # List of all district ids
T_range = range(1,145) # List of all time slots
test_slot1 = range(45,153,12) # Last time slot of each test window on days 23, 27, 31
test_slot2 = range(57,153,12) # Last time slot of each test window on days 25, 29
# Time slots to be predicted for each test day
S_range = {'2016-01-23':test_slot1, '2016-01-25':test_slot2, '2016-01-27':test_slot1,
           '2016-01-29':test_slot2, '2016-01-31':test_slot1}
# Dictionary of District Info Table
district_dict = pd.read_table(path['train']['district'], header=None, index_col=0)
district_dict = district_dict[1].to_dict()
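In [ ]:
# Sanity check of the prediction slots (illustrative): days 23/27/31 have nine
# test windows and days 25/29 have eight, so 66 * (3*9 + 2*8) = 2838 predictions
# in total -- the denominator used by score_on_test_data further below.
print len(test_slot1) == 9 and len(test_slot2) == 8
print 66 * (3 * len(test_slot1) + 2 * len(test_slot2)) == 2838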
In [7]:
print len(district_dict) == 66
In [8]:
def index(district, slot):
    # build the list of (district, time) tuples used as MultiIndex keys
    if type(district) is int:
        district = [district]
    if type(slot) is int:
        slot = [slot]
    return [x for x in itertools.product(district, slot)]
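In [ ]:
# Quick illustration of index() (added example, not contest code): it builds the
# (district, time) tuples used as MultiIndex keys throughout this notebook.
print index(1, [2, 3]) == [(1, 2), (1, 3)]
print index([1, 2], 3) == [(1, 3), (2, 3)]
print len(index(D_range, T_range)) == 66 * 144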
In [9]:
def District(df):
    return df['district'].apply(lambda x: district_dict[x])
In [10]:
def Weekday(df):
    return pd.to_datetime(df['time']).apply(lambda x: x.weekday())
In [11]:
def Time(df, day):
    time = pd.to_datetime(df['time'])
    time = (time - pd.Timestamp(day)) / M / 10 + 1
    return time.astype(int)
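In [ ]:
# Worked example of the slot encoding (added example): each day splits into 144
# ten-minute slots, so 00:05 falls in slot 1 and 18:30 in slot 112.
example = pd.DataFrame({'time': ['2016-01-01 00:05:00', '2016-01-01 18:30:00']})
print Time(example, '2016-01-01').tolist() == [1, 112]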
In [12]:
def Traffic(day, option):
    df = pd.read_table(path[option]['traffic'].format(day.date()), header=None,
                       names=['district', 'LV1', 'LV2', 'LV3', 'LV4', 'time'])
    df['district'] = District(df)
    df['weekday'] = Weekday(df)
    df['time'] = Time(df, str(day.date()))
    for L in ['LV{}'.format(n) for n in range(1, 5)]:
        # traffic entries look like 'level:count'; keep only the count
        df[L] = df[L].apply(lambda x: x.split(':')[1]).astype(int)
    index = pd.MultiIndex.from_arrays([df['district'].values, df['time'].values], names=('district', 'time'))
    return pd.DataFrame({'weekday': df['weekday'].values,
                         'day': day.day,
                         'district': df['district'].values,
                         'time': df['time'].values,
                         'LV1': df['LV1'].values,
                         'LV2': df['LV2'].values,
                         'LV3': df['LV3'].values,
                         'LV4': df['LV4'].values}, index=index).sort_index()
In [13]:
# DTG: District-Time Gap
def DTG(day, option):
    df = pd.read_table(path[option]['order'].format(day.date()), header=None, usecols=[1, 3, 6],
                       names=['driver', 'district', 'time'])
    # orders with a null driver id went unanswered; their count per (district, time) is the gap
    df = df[df['driver'].isnull()]
    df['district'] = District(df)
    df['time'] = Time(df, day.date())
    Order = df.groupby(['district', 'time'])
    return pd.DataFrame({'gap': Order.size()})
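In [ ]:
# Toy illustration of the gap definition (made-up rows, not contest data): the gap
# of a (district, time) cell is the number of orders left without a driver.
toy = pd.DataFrame({'driver': [None, None, 'd1'], 'district': [1, 1, 1], 'time': [5, 5, 5]})
toy = toy[toy['driver'].isnull()]
print pd.DataFrame({'gap': toy.groupby(['district', 'time']).size()})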
In [14]:
def Weather(day, option):
    df = pd.read_table(path[option]['weather'].format(day.date()), header=None,
                       names=['time', 'weather', 'temperature', 'pm2.5'])
    df['time'] = Time(df, day.date())
    df = df.drop_duplicates(subset='time')
    # reindex onto the full 144 slots, then fill the gaps from neighbouring readings
    DF = pd.DataFrame({'time': T_range}, columns=df.columns)
    DF = DF.set_index('time')
    DF.update(df.set_index('time'))
    return DF.fillna(method='bfill').fillna(method='ffill')
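In [ ]:
# Toy illustration of the weather gap-filling (made-up values): slots missing from
# the raw file inherit the nearest observed reading, backward first, then forward.
toy = pd.DataFrame({'weather': [np.nan, 2.0, np.nan, 4.0, np.nan]}, index=range(1, 6))
print toy.fillna(method='bfill').fillna(method='ffill')['weather'].tolist() == [2.0, 2.0, 4.0, 4.0, 4.0]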
In [15]:
# Dictionary of order data for testing, indexed by date, cols = [gap]
test_order = {}
# Dictionary of traffic data for testing, indexed by date, cols = [weekday, district, time, LV1, LV2, LV3, LV4]
test_traffic = {}
# Dictionary of weather data for testing, indexed by date, cols = [temperature, weather, pm2.5]
test_weather = {}
for day in pd.date_range('1/23/2016', periods=5, freq='2D'):
    test_order[str(day.date())] = DTG(day, 'test')
    test_traffic[str(day.date())] = Traffic(day, 'test')
    test_weather[str(day.date())] = Weather(day, 'test')
In [16]:
# Check data
print len(test_order.keys()) == 5
print len(test_traffic.keys()) == 5
print len(test_weather.keys()) == 5
print all(test_order['2016-01-23'].columns.values == np.array(['gap']))
print all(test_traffic['2016-01-23'].columns.values == np.array(['LV1', 'LV2', 'LV3', 'LV4', 'day', 'district', 'time', 'weekday']))
print all(test_weather['2016-01-23'].columns.values == np.array(['weather', 'temperature', 'pm2.5']))
In [21]:
# Dictionary of order data for training, indexed by date, cols = [gap]
# train_order = {}
# Dictionary of traffic data for training, indexed by date, cols = [weekday, district, time, LV1, LV2, LV3, LV4]
train_traffic = {}
# Dictionary of weather data for training, indexed by date, cols = [temperature, weather, pm2.5]
train_weather = {}
for day in pd.date_range('1/1/2016', periods=21, freq='D'):
    # train_order[str(day.date())] = DTG(day, 'train')
    train_traffic[str(day.date())] = Traffic(day, 'train')
    train_weather[str(day.date())] = Weather(day, 'train')
In [14]:
# Check data (train_order is commented out above; re-enable it before checking)
# print len(train_order.keys()) == 21
print len(train_traffic.keys()) == 21
print len(train_weather.keys()) == 21
In [36]:
# POI2 = pd.DataFrame()
# with open(path['train']['poi'], 'r') as f:
#     for i, line in enumerate(f):
#         interests = line.strip().split('\t')
#         row = {'district': interests[0]}
#         for item in interests[1:]:
#             category, num = item.split(':')
#             if int(category.split('#')[0]) in [4, 13, 16, 23, 24]:
#                 if category in row:
#                     row[category] += int(num)
#                 else:
#                     row[category] = int(num)
#         POI2 = pd.concat([POI2, pd.DataFrame(row, index=[i])])
# POI2['district'] = District(POI2)
# POI2 = POI2.set_index('district').sort_index()
# POI2 = POI2.fillna(0)
# POI2.to_csv('./POI2.csv', columns=POI2.columns, header=True)
POI2 = pd.read_csv('./POI2.csv', index_col='district')
In [37]:
# columns = ['district'] + range(1, 26)
# POI = pd.DataFrame(columns=columns)
# with open(path['train']['poi'], 'r') as f:
#     for i, line in enumerate(f):
#         interests = line.strip().split('\t')
#         row = {'district': interests[0]}
#         for item in interests[1:]:
#             category, num = item.split(':')
#             category = int(category.split('#')[0])
#             if category in row:
#                 row[category] += int(num)
#             else:
#                 row[category] = int(num)
#         POI = pd.concat([POI, pd.DataFrame(row, index=[i], columns=columns)])
# POI['district'] = District(POI)
# POI = POI.set_index('district').sort_index()
# POI = POI.fillna(0)
# # Standardization
# POI = (POI - POI.mean()) / POI.std()
# POI.to_csv('./POI.csv', columns=POI.columns, header=True)
POI = pd.read_csv('./POI.csv', index_col='district')
In [652]:
# Merge weather and gap data for each day, storing each day's features in its own CSV
# def preprocessing1(day):
#     X = train_order[day].reindex(index(D_range, T_range)).fillna(0)
#     X['last'] = X['gap'].shift().fillna(method='bfill')
#     X['last2'] = X['gap'].shift(2).fillna(method='bfill')
#     X['GAP'] = pd.Series(index=X.index)
#     X['time'] = pd.Series(index=X.index)
#     X['district'] = pd.Series(index=X.index)
#     for D in D_range:
#         X.loc[(D, T_range), 'GAP'] = X.loc[(D, T_range), 'gap'].shift(-1).fillna(method='ffill')
#         X.loc[(D, T_range), 'time'] = pd.DataFrame(T_range, columns=['time'], index=index(D, T_range))
#         X.loc[(D, T_range), 'district'] = pd.DataFrame([D]*144, columns=['district'], index=index(D, T_range))
#     X = X.join(train_weather[day], on='time')
#     Y_gap = X['GAP']
#     X.drop('GAP', axis=1, inplace=True)
#     return X, Y_gap
# for d in pd.date_range('1/2/2016', periods=20, freq='D'):
#     X, Y_gap = preprocessing1(str(d.date()))
#     X = X.join(POI2, on='district')
#     X.sort_index(inplace=True)
#     Y_gap.sort_index(inplace=True)
#     X.to_csv('./X2/{}.csv'.format(str(d.date())), columns=X.columns, header=True)
#     Y_gap.to_csv('./Y2_gap/{}.csv'.format(str(d.date())), header=True)
In [ ]:
# # from sklearn.ensemble import GradientBoostingClassifier
# # from sklearn.cross_validation import train_test_split
# without54 = range(1,67)
# without54.remove(54)
# def preprocessing2(day, Mask):
#     # fill the missing traffic of district 54 with another district's data
#     traffic_rest = train_traffic[day].reindex(index(without54, T_range)).fillna(method='bfill')
#     mask = Mask # the district replacing district 54
#     traffic_54 = pd.DataFrame(traffic_rest.loc[index(mask, T_range)].values,
#                               index=index(54, T_range), columns=traffic_rest.columns)
#     return traffic_rest, traffic_54
# for D in without54:
#     X_rest = []
#     Y_gap_rest = []
#     X_54 = []
#     Y_gap_54 = []
#     for d in pd.date_range('1/2/2016', periods=20, freq='D'):
#         X_all = pd.read_csv('./X2/{}.csv'.format(str(d.date())), index_col=('district', 'time'))
#         Y_gap_all = pd.read_csv('./Y2_gap/{}.csv'.format(str(d.date())), index_col=('district', 'time'))
#         traffic_rest, traffic_54 = preprocessing2(str(d.date()), D)
#         tempX_rest = pd.concat([X_all.loc[(without54, T_range),:], traffic_rest], axis=1)
#         tempX_54 = pd.concat([X_all.loc[(54, T_range),:], traffic_54], axis=1)
#         tempX_rest = tempX_rest.drop(['time.1', 'district.1'], 1)
#         tempX_54 = tempX_54.drop(['time.1', 'district.1'], 1)
#         X_rest.append(tempX_rest)
#         X_54.append(tempX_54)
#         Y_gap_rest.append(Y_gap_all.loc[(without54, T_range),:])
#         Y_gap_54.append(Y_gap_all.loc[(54, T_range),:])
#     X_rest = pd.concat(X_rest)
#     X_rest.sort_index(inplace=True)
#     X_54 = pd.concat(X_54)
#     X_54.sort_index(inplace=True)
#     Y_gap_rest = pd.concat(Y_gap_rest)
#     Y_gap_rest.sort_index(inplace=True)
#     Y_gap_54 = pd.concat(Y_gap_54)
#     Y_gap_54.sort_index(inplace=True)
#     Y_gap_rest[Y_gap_rest > 10] = 11
#     Y_gap_54[Y_gap_54 > 10] = 11
#     columns = ['gap', 'last', 'last2', 'LV1', 'LV2', 'LV3', 'LV4', 'district', 'time', 'pm2.5']
#     X_train, X_test, y_train, y_test = train_test_split(X_rest[columns], Y_gap_rest, test_size=0.4)
#     params = {'loss': 'deviance', 'learning_rate': 0.1, 'n_estimators': 20, 'min_samples_leaf': 10,
#               'max_depth': 3, 'max_features': 0.8, 'subsample': 1.0, 'min_samples_split': 20, 'random_state': 1,
#               'verbose': 0}
#     grd = GradientBoostingClassifier(**params)
#     grd.fit(X_train[columns], y_train['GAP'])
#     testX = X_54[Y_gap_54['GAP'] > 0][columns]
#     testY = Y_gap_54[Y_gap_54['GAP'] > 0]['GAP']
#     print "Replaced by district {} : {:.8f}".format(D, ((testY - grd.predict(testX)).abs() / testY).sum() / testY.shape[0])
In [ ]:
# Replaced by district 17 : 0.98912147
# Replaced by district 18 : 0.98698988
# Replaced by district 19 : 0.77650698
# Replaced by district 20 : 0.72371558 -> 3rd
# Replaced by district 21 : 0.90578814
# Replaced by district 22 : 0.85776423
# Replaced by district 23 : 3.11777664
# Replaced by district 24 : 0.59887103 -> 1st
# Replaced by district 25 : 0.74924907
# Replaced by district 26 : 0.98615067
# Replaced by district 27 : 0.80867298
# Replaced by district 28 : 0.61996468 -> 2nd
# Replaced by district 29 : 0.85480506
# Replaced by district 30 : 0.98992951
# Replaced by district 31 : 0.97505635
# Replaced by district 32 : 1.00273342
# Replaced by district 33 : 0.99017887
# Replaced by district 34 : 0.98325421
# Replaced by district 35 : 0.99271328
# Replaced by district 36 : 0.99436532
# Replaced by district 37 : 1.04983252
# Replaced by district 38 : 0.97790965
# Replaced by district 39 : 0.99463147
# Replaced by district 40 : 0.99509663
# Replaced by district 41 : 0.98831103
# Replaced by district 42 : 0.74919821
# Replaced by district 43 : 0.99647533
# Replaced by district 44 : 0.99103486
# Replaced by district 45 : 0.98905673
# Replaced by district 46 : 0.90203604
# Replaced by district 47 : 0.97719033
# Replaced by district 48 : 1.65585771
# Replaced by district 49 : 0.99523330
# Replaced by district 50 : 0.99402964
# Replaced by district 51 : 3.39464397
# Replaced by district 52 : 0.99622356
# Replaced by district 53 : 0.99042104
# Replaced by district 55 : 0.99966432
# Replaced by district 56 : 0.99899295
# Replaced by district 57 : 0.98358989
# Replaced by district 58 : 0.99629550
# Replaced by district 59 : 0.99899295
# Replaced by district 60 : 0.99699324
# Replaced by district 61 : 0.99699324
# Replaced by district 62 : 1.00290126
# Replaced by district 63 : 0.99707956
# Replaced by district 64 : 0.99622356
# Replaced by district 65 : 0.99874119
# Replaced by district 66 : 0.99471299
In [23]:
# Fill the missing traffic of district 54 with another district's data
def replace54(day, mask):
    without54 = range(1, 67)
    without54.remove(54)
    traffic_rest = train_traffic[day].reindex(index(without54, T_range)).fillna(method='bfill')
    traffic_54 = pd.DataFrame(traffic_rest.loc[index(mask, T_range)].values,
                              index=index(54, T_range), columns=traffic_rest.columns)
    return traffic_rest, traffic_54
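In [ ]:
# Quick sanity check of replace54 (assuming the training loop above has run): the
# synthetic district-54 frame should cover all 144 slots, copied from the mask district.
traffic_rest, traffic_54 = replace54('2016-01-02', 9)
print traffic_54.shape[0] == 144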
In [18]:
def Masking(Mask):
    # Mask is the district replacing district 54
    without54 = range(1, 67)
    without54.remove(54)
    X = []
    Y = []
    for d in pd.date_range('1/2/2016', periods=20, freq='D'):
        X_all = pd.read_csv('./X2/{}.csv'.format(str(d.date())), index_col=('district', 'time'))
        Y_all = pd.read_csv('./Y2_gap/{}.csv'.format(str(d.date())), index_col=('district', 'time'))
        traffic_rest, traffic_54 = replace54(str(d.date()), Mask)
        tempX_rest = pd.concat([X_all.loc[(without54, T_range),:], traffic_rest], axis=1)
        tempX_rest = tempX_rest.drop(['time.1', 'district.1'], 1)
        tempX_54 = pd.concat([X_all.loc[(54, T_range),:], traffic_54], axis=1)
        tempX_54 = tempX_54.drop(['time.1', 'district.1'], 1)
        tempX_all = pd.concat([tempX_rest, tempX_54]).sort_index()
        X.append(tempX_all)
        tempY_rest = Y_all.loc[(without54, T_range),:]
        tempY_54 = Y_all.loc[(54, T_range),:]
        tempY_all = pd.concat([tempY_rest, tempY_54]).sort_index()
        Y.append(tempY_all)
    X = pd.concat(X)
    Y = pd.concat(Y)
    return X, Y
In [769]:
# Best District : 9
# without54 = range(1,67)
# without54.remove(54)
# for D in without54:
#     X, Y = Masking(D)
#     newY = Y.copy()
#     newX = X.copy()
#     newX = newX[newY['GAP'] > 0]
#     newX = newX.drop('last2', axis=1)
#     newY = newY[newY['GAP'] > 0]
#     newY[newY['GAP'] > 39] = 40
#     class_weight = dict(zip(range(1,38), range(38,1,-1)))
#     columns = ['gap', 'last', 'pm2.5', '16#4', '16#10', '24#2',
#                'LV1', 'LV2', 'LV3', 'LV4', 'day', 'time']
#     from sklearn.ensemble import RandomForestClassifier
#     params = {'criterion': 'entropy', 'n_estimators': 40, 'min_samples_leaf': 10, 'min_samples_split': 20,
#               'max_depth': 9, 'max_features': 0.85, 'random_state': 1,
#               'verbose': 0, 'class_weight': class_weight}
#     rfc = RandomForestClassifier(**params)
#     rfc.fit(newX[columns], newY['GAP'])
#     print "Replaced by district {} : {:.8f}".format(D, score_on_test_data(rfc, columns))
In [ ]:
# DF = pd.DataFrame(columns=range(1,12))
# for d in pd.date_range('1/2/2016', periods=20, freq='1D'):
#     df = train_order[str(d.date())]
#     df = df.reindex(index(D_range, T_range)).fillna(0)
#     for D in D_range:
#         df.loc[(D, T_range), 'diff'] = df.loc[(D, T_range), 'gap'].diff().shift(-1).fillna(0)
#     row = pd.DataFrame(df['diff'].value_counts().sort_values(ascending=False).iloc[:11].index.values.reshape((1,11)),
#                        columns=range(1,12), index=[d.day])
#     DF = DF.append(row)
# print DF
In [ ]:
# DF = pd.DataFrame(columns=range(1,12))
# for d in pd.date_range('1/2/2016', periods=20, freq='1D'):
#     df = train_order[str(d.date())]
#     row = pd.DataFrame(df['gap'].value_counts().sort_values(ascending=False)[:11].index.values.reshape((1,11)),
#                        columns=range(1,12), index=[d.day])
#     DF = DF.append(row)
In [ ]:
# for d in pd.date_range('1/23/2016', periods=5, freq='2D'):
#     df = test_order[str(d.date())]
#     row = pd.DataFrame(df['gap'].value_counts().sort_values(ascending=False)[:11].index.values.reshape((1,11)),
#                        columns=range(1,12), index=[d.day])
#     DF = DF.append(row)
In [ ]:
# The columns are the 11 most common gap values; the index is the day
# print DF
In [30]:
def score_GBD(day, pred):
    # mean absolute percentage error over the slots whose true gap is positive
    ans = test_order[day].reindex(index(D_range, S_range[day])).fillna(0)
    ans = ans['gap'].values
    pred = pred[ans > 0]
    ans = ans[ans > 0]
    gap = (ans - pred) / ans
    return np.fabs(gap).sum() / ans.shape[0]
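In [ ]:
# Toy check of the evaluation metric (made-up numbers): it is the mean absolute
# percentage error over slots whose true gap is positive; zero-gap slots are
# dropped so the ratio stays defined.
true_gap = np.array([10.0, 0.0, 5.0])
pred_gap = np.array([8.0, 3.0, 5.0])
pred_gap, true_gap = pred_gap[true_gap > 0], true_gap[true_gap > 0]
print np.fabs((true_gap - pred_gap) / true_gap).sum() / true_gap.shape[0] == 0.1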
In [31]:
def score_on_test_data(clf, columns):
    # SLOT1/SLOT2 are the last observed slots, one before the slots to be predicted
    SLOT1 = range(44, 152, 12)
    SLOT2 = range(56, 152, 12)
    RANGE = {'2016-01-23': SLOT1, '2016-01-25': SLOT2, '2016-01-27': SLOT1, '2016-01-29': SLOT2, '2016-01-31': SLOT1}
    scores = []
    for day in RANGE.keys():
        x = select_last_points(day, RANGE[day])
        if len(columns) == 0:
            score = score_GBD(day, clf.predict(x))
        else:
            score = score_GBD(day, clf.predict(x[columns]))
        # weight each day's score by its number of rows
        scores.append(score * x.shape[0])
        # print '\t score: {}'.format(score)
    return np.array(scores).sum() / 2838 # 2838 = 66 districts * 43 predicted slots
In [32]:
def select_last_points(day, slot):
    slot = np.array(slot)
    x = test_order[day].reindex(index(D_range, slot)).fillna(0)
    x['last'] = test_order[day].reindex(index(D_range, slot - 1)).fillna(0)['gap'].values
    x = pd.concat([x, test_traffic[day].reindex(index(D_range, slot))], axis=1)
    # District 54 has no traffic data; copy the traffic of district Mask instead
    for t in slot:
        x.loc[(54, t), 'LV1':'weekday'] = x.loc[(Mask, t), 'LV1':'weekday'].values
    x = x.join(test_weather[day], on='time')
    x = x.join(POI2, on='district')
    # print "Select data from {} on {}".format(day, slot)
    # print "\t shape: {}".format(x.shape)
    return x
In [34]:
Mask = 9
X, Y = Masking(Mask)
In [409]:
from sklearn.cross_validation import train_test_split
# newX/newY are built in the In [816] cell further below (the cells ran out of order)
X_train, X_test, y_train, y_test = train_test_split(newX.drop('last2', 1), newY['GAP'], test_size=0.3)
In [599]:
class_weight = dict(zip(range(1,44), range(44,1,-1)))
from sklearn.ensemble import ExtraTreesClassifier
params = {'criterion': 'entropy', 'n_estimators': 30, 'min_samples_leaf': 1, 'min_samples_split': 2,
          'max_depth': 11, 'max_features': 0.9, 'random_state': 1,
          'verbose': 0, 'class_weight': class_weight}
etc = ExtraTreesClassifier(**params)
etc.fit(newX[columns], newY['GAP'])
In [600]:
score_on_test_data(etc, columns)
In [816]:
newY = Y.copy()
newX = X.copy()
newX = newX[newY['GAP']>0]
newX = newX.drop('last2',axis=1)
newY = newY[newY['GAP']>0]
newY[newY['GAP']>39]=40
In [772]:
class_weight = dict(zip(range(1,38), range(38,1,-1)))
# columns = ['gap', 'last', 'pm2.5', '16#4', '16#10', '24#2',
#            'LV1', 'LV2', 'LV3', 'LV4', 'day', 'time']
from sklearn.ensemble import RandomForestClassifier
params = {'criterion': 'entropy', 'n_estimators': 40, 'min_samples_leaf': 10, 'min_samples_split': 20,
          'max_depth': 9, 'max_features': 0.85, 'random_state': 1,
          'verbose': 0, 'class_weight': class_weight}
rfc = RandomForestClassifier(**params)
rfc.fit(newX, newY['GAP'])
# rfc.fit(newX[columns], etc.predict(newX[columns]))
In [828]:
# print rfc.feature_importances_
columns = newX.columns[[i for i, important in enumerate(rfc.feature_importances_ > 0.01) if important]].values
print columns
In [26]:
newY = Y.copy()
newX = X.copy()
newX = newX[newY['GAP']>0]
newX = newX.drop('last2',axis=1)
newY = newY[newY['GAP']>0]
newY[newY['GAP']>39]=40
# from sklearn.cross_validation import train_test_split
# X_train, X_test, y_train, y_test = train_test_split(newX[columns], newY['GAP'], test_size=0.7)
In [50]:
columns = ['gap', 'last', 'pm2.5', '16#4', '16#10', '24#2',
           'LV1', 'LV2', 'LV3', 'LV4', 'day', 'time']
from sklearn.ensemble import RandomForestClassifier
class_weight = dict(zip(range(1,38), range(38,1,-1)))
params = {'criterion': 'entropy', 'n_estimators': 40, 'min_samples_leaf': 10, 'min_samples_split': 20,
          'max_depth': 9, 'max_features': 0.85, 'random_state': 1,
          'verbose': 0, 'class_weight': class_weight}
rfc1 = RandomForestClassifier(**params)
rfc1.fit(newX[columns], newY['GAP'])
print score_on_test_data(rfc1, columns)
In [832]:
# Best District : 9
# without54 = range(1,67)
# without54.remove(54)
# for D in [9,11,19,20,24,28,29,36,37,59]:
#     X, Y = Masking(D)
#     newY = Y.copy()
#     newX = X.copy()
#     newX = newX[newY['GAP'] > 0]
#     newX = newX.drop('last2', axis=1)
#     newY = newY[newY['GAP'] > 0]
#     newY[newY['GAP'] > 39] = 40
#     class_weight = dict(zip(range(1,38), range(38,1,-1)))
#     columns = ['gap', 'last', 'pm2.5', '16#4', '16#10', '24#2',
#                'LV1', 'LV2', 'LV3', 'LV4', 'day', 'time']
#     from sklearn.ensemble import RandomForestClassifier
#     params = {'criterion': 'entropy', 'n_estimators': 40, 'min_samples_leaf': 10, 'min_samples_split': 20,
#               'max_depth': 9, 'max_features': 0.85, 'random_state': 1,
#               'verbose': 0, 'class_weight': class_weight}
#     rfc = RandomForestClassifier(**params)
#     rfc.fit(newX[columns], newY['GAP'])
#     print "Replaced by district {} : {:.8f}".format(D, score_on_test_data(rfc, columns))
In [116]:
from sklearn.ensemble import GradientBoostingRegressor
params = {'loss': 'quantile', 'alpha': 0.85, 'n_estimators': 100, 'max_features': 0.85, 'random_state': 1,
          'min_samples_split': 10, 'max_depth': 10, 'learning_rate': 0.08, 'subsample': 0.9, 'verbose': 0}
Regr = GradientBoostingRegressor(**params)
# fit the regressor on the classifier's predictions rather than the raw labels
Regr.fit(newX[columns], rfc1.predict(newX[columns]))
print score_on_test_data(Regr, columns)
In [109]:
def write5(x, day, slot, rfc, mode):
    with open('ans5_v3.csv', mode) as f:
        writer = csv.writer(f, delimiter=',')
        for D in D_range:
            for S in slot:
                # features come from slot S; the written row targets slot S+1
                gap = rfc.predict(x.loc[(D, S)].values.reshape(1, -1))[0]
                writer.writerow([str(D), '{}-{}'.format(day, S + 1), '{:.15f}'.format(gap)])
for day in S_range.keys():
    x = select_last_points(day, S_range[day])
    write5(x[columns], day, S_range[day], Regr, 'a')
In [117]:
from sklearn.ensemble import AdaBoostRegressor
params = {'base_estimator':Regr,'n_estimators':5, 'learning_rate':1.0, 'random_state':1, 'loss':'square'}
adbr = AdaBoostRegressor(**params)
adbr.fit(newX[columns], newY['GAP'])
print score_on_test_data(adbr, columns)
In [91]:
from sklearn.ensemble import BaggingRegressor
params = {'base_estimator':Regr, 'verbose':1,'n_estimators':5}
bgr = BaggingRegressor(**params)
bgr.fit(newX[columns], Regr.predict(newX[columns]))
print score_on_test_data(bgr, columns)
In [737]:
from sklearn.ensemble import GradientBoostingClassifier
params = {'loss': 'deviance', 'learning_rate': 0.1, 'n_estimators': 50, 'min_samples_leaf': 10,
          'max_depth': 3, 'max_features': 0.8, 'subsample': 1.0, 'min_samples_split': 20, 'random_state': 1,
          'verbose': 0}
grd = GradientBoostingClassifier(**params)
grd.fit(newX[columns], newY['GAP'])
print score_on_test_data(grd, columns)
In [628]:
from sklearn.ensemble import BaggingClassifier
params = {'base_estimator':grd, 'verbose':1,'n_estimators':5}
bgc = BaggingClassifier(**params)
bgc.fit(newX[columns], grd.predict(newX[columns]))
In [629]:
score_on_test_data(bgc, columns)
In [624]:
score_on_test_data(Regr, columns)
In [525]:
# Select important columns
# columns = X_train.columns[[i for i, important in enumerate(rfc.feature_importances_ > 0.001) if important]].values
columns = ['gap', 'last', '4', '13', '16', '23', '24', 'LV1', 'LV2', 'LV3', 'time']
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(newX[columns], newY['GAP'], test_size=0.3)
In [526]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.grid_search import GridSearchCV
from sklearn import grid_search
from sklearn.metrics import make_scorer
from sklearn.metrics import mean_squared_error
scoring_function = make_scorer(mean_squared_error, greater_is_better=False)
parameters = {'min_samples_split': np.arange(60, 140, 20)}
# params = {'criterion': 'entropy', 'n_estimators': 40, 'min_samples_leaf': 10, 'min_samples_split': 20,
#           'max_depth': 9, 'max_features': 0.85, 'random_state': 1,
#           'verbose': 0, 'class_weight': dict(zip(range(1,20), range(20,1,-1)))}
# rfc = GridSearchCV(RandomForestClassifier(**params), parameters, scoring=scoring_function)
rfc = RandomForestClassifier(**params)
rfc.fit(X_train, y_train)
In [528]:
print ((y_test - rfc.predict(X_test)).abs() / y_test).sum() / y_test.shape[0]
In [351]:
rfc.grid_scores_
In [352]:
rfc.best_params_
In [353]:
rfc = rfc.best_estimator_
In [390]:
final_columns = columns + ['last2']
newY = Y.copy()
newX = X.copy()
newX = newX[newY['GAP']>0]
newY = newY[newY['GAP']>0]
newY[newY['GAP']>39]=40
In [404]:
# params = {'criterion': 'entropy', 'n_estimators': 40, 'min_samples_leaf': 10, 'min_samples_split': 20,
#           'max_depth': 9, 'max_features': 0.85, 'random_state': 1,
#           'verbose': 0, 'class_weight': dict(zip(range(1,20), range(20,1,-1)))}
rfc = RandomForestClassifier(**params)
rfc.fit(newX[columns], newY['GAP'])
In [531]:
def select_last_points_ans(day, slot):
    slot = np.array(slot)
    x = test_order[day].reindex(index(D_range, slot)).fillna(0)
    x['last'] = test_order[day].reindex(index(D_range, slot - 1)).fillna(0)['gap'].values
    x['last2'] = test_order[day].reindex(index(D_range, slot - 2)).fillna(0)['gap'].values
    x = pd.concat([x, test_traffic[day].reindex(index(D_range, slot))], axis=1)
    # District 54 has no traffic data; copy the traffic of district Mask instead
    for t in slot:
        x.loc[(54, t), 'LV1':'weekday'] = x.loc[(Mask, t), 'LV1':'weekday'].values
    x = x.join(test_weather[day], on='time')
    x = x.join(POI, on='district')
    print "Select data from {} on {}".format(day, slot)
    print "\t shape: {}".format(x.shape)
    return x
In [532]:
def write5(x, day, slot, rfc, mode):
    with open('ans5_v2.csv', mode) as f:
        writer = csv.writer(f, delimiter=',')
        for D in D_range:
            for S in slot:
                gap = rfc.predict(x.loc[(D, S)].values.reshape(1, -1))[0]
                writer.writerow([str(D), '{}-{}'.format(day, S + 1), '{:.15f}'.format(gap)])
In [534]:
for day in S_range.keys():
    x = select_last_points_ans(day, S_range[day])
    write5(x[columns], day, S_range[day], rfc, 'a')
In [ ]:
from sklearn.ensemble import GradientBoostingClassifier
params = {'loss': 'deviance', 'learning_rate': 0.1, 'n_estimators': 50, 'min_samples_leaf': 10,
          'max_depth': 3, 'max_features': 0.8, 'subsample': 1.0, 'min_samples_split': 20, 'random_state': 1,
          'verbose': 1}
grd = GradientBoostingClassifier(**params)
grd.fit(X_train, y_train)
In [200]:
SLOT1 = range(44, 152, 12)
SLOT2 = range(56, 152, 12)
RANGE = {'2016-01-23': SLOT1, '2016-01-25': SLOT2, '2016-01-27': SLOT1, '2016-01-29': SLOT2, '2016-01-31': SLOT1}
scores = []
for day in RANGE.keys():
    x = select_last_points(day, RANGE[day])
    y_pred = grd.predict(x[columns])
    score = score_GBD(day, y_pred)
    scores.append(score * x.shape[0])
    print '\t score: {}'.format(score)
print np.array(scores).sum() / 2838
In [ ]:
params = {'loss': 'deviance', 'learning_rate': 0.1, 'n_estimators': 50, 'min_samples_leaf': 10,
          'max_depth': 3, 'max_features': 0.8, 'subsample': 1.0, 'min_samples_split': 20, 'random_state': 1}
clf = GradientBoostingClassifier(**params)
clf.fit(X, Y)
In [ ]:
columns = []
for i, important in enumerate(clf.feature_importances_ > 0.02):
    if important:
        columns.append(X.columns[i])
In [ ]:
params = {'loss': 'deviance', 'learning_rate': 0.15, 'n_estimators': 50, 'min_samples_leaf': 10,
          'max_depth': 3, 'max_features': 0.8, 'subsample': 1.0, 'min_samples_split': 20, 'random_state': 1,
          'verbose': 1}
clf = GradientBoostingClassifier(**params)
clf.fit(X[columns], Y)
In [ ]:
# learning rate: 0.1 -> 0.05 : 0.498091376721 -> 0.50989613439
# learning rate: 0.1 -> 0.15 : 0.498091376721 -> 0.50989613439
get_score(clf, columns)
In [ ]:
def write4(x, day, slot, clf, mode):
    with open('ans4_v1.csv', mode) as f:
        writer = csv.writer(f, delimiter=',')
        for D in D_range:
            for S in slot:
                if D == 54:
                    # no model features for district 54: blend its last three observed gaps
                    df = test_order[day].reindex(index(D_range, T_range)).fillna(0)['gap']
                    gap = 0.5 * df.loc[(D, S)] + 0.3 * df.loc[(D, S - 1)] + 0.2 * df.loc[(D, S - 2)]
                else:
                    gap = clf.predict(x.loc[(D, S)].values.reshape(1, -1))[0]
                gap = 0 if gap < 0 else gap
                writer.writerow([str(D), '{}-{}'.format(day, S + 1), '{:.15f}'.format(gap)])
In [ ]:
SLOT1 = range(45, 153, 12)
SLOT2 = range(57, 153, 12)
RANGE = {'2016-01-23': SLOT1, '2016-01-25': SLOT2, '2016-01-27': SLOT1, '2016-01-29': SLOT2, '2016-01-31': SLOT1}
scores = []
for day in RANGE.keys():
    x = select_last_points(day, RANGE[day])
    write4(x[columns], day, S_range[day], clf, 'a')
In [ ]:
def score5(day, pred):
    Ds = range(1, 67)
    Ds.remove(54)
    ans = test_order[day].reindex(index(D_range, S_range[day])).fillna(0)
    ans = ans['gap'].values
    pred = pred[ans > 0]
    ans = ans[ans > 0]
    gap = (ans - pred) / ans
    # For district 54 (note: this special handling was never finished; 'Ds' and 'another' are unused)
    another = test_order[day].reindex(index(54, S_range[day])).fillna(0)
    return np.fabs(gap).sum() / ans.shape[0]
def get_score(clf, columns):
    SLOT1 = range(44, 152, 12)
    SLOT2 = range(56, 152, 12)
    RANGE = {'2016-01-23': SLOT1, '2016-01-25': SLOT2, '2016-01-27': SLOT1, '2016-01-29': SLOT2, '2016-01-31': SLOT1}
    scores = []
    for day in RANGE.keys():
        x = select_last_points(day, RANGE[day])
        score = score5(day, clf.predict(x[columns].dropna()))
        scores.append(score * x.shape[0])
        print '\t score: {}'.format(score)
    print np.array(scores).sum() / 2838
In [ ]:
Y = Y_gap[Y_gap>0]
X = X[Y_gap>0]
Y[Y>12]=13
In [ ]:
# columns = ['gap', 'last', 'LV1', 'LV2', 'LV3', 'LV4', 'district', 'time', 'pm2.5']
new_columns = columns + ['last2']
In [ ]:
from sklearn.cross_validation import train_test_split
X_train, X_train_lr, y_train, y_train_lr = train_test_split(X[columns], Y, test_size=0.7)
params = {'loss': 'deviance', 'learning_rate': 0.15, 'n_estimators': 50, 'min_samples_leaf': 10,
          'max_depth': 3, 'max_features': 0.8, 'subsample': 1.0, 'min_samples_split': 20, 'random_state': 1,
          'verbose': 1}
grd = GradientBoostingClassifier(**params)
grd.fit(X_train, y_train)
In [ ]:
x = select_last_points(day, RANGE['2016-01-23'])
print x[columns].loc[(54,RANGE['2016-01-23']),:]
#print get_score(grd, columns)
# print ((y_train_lr - grd.predict(X_train_lr)).abs() / y_train_lr).sum() / (66*y_train_lr.shape[0])
In [ ]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
# Gradient-boosted trees as feature transformers: grd.apply() returns the leaf index
# each sample lands in per tree; one-hot encode those indices and fit a logistic
# regression on top. The encoder is fit on the GBDT-training split and the LR on
# the held-out split so the leaves are not overfit.
enc = OneHotEncoder()
lm_params = {'class_weight': {1:16, 2:14, 3:12, 4:10, 5:8}, 'solver': 'newton-cg', 'random_state': 1,
             'penalty': 'l2'}
lm = LogisticRegression(**lm_params)
enc.fit(grd.apply(X_train)[:, :, 0])
lm.fit(enc.transform(grd.apply(X_train_lr)[:, :, 0]), y_train_lr)
# lm.fit(X_train_lr, y_train_lr)
In [ ]:
SLOT1 = range(44, 152, 12)
SLOT2 = range(56, 152, 12)
RANGE = {'2016-01-23': SLOT1, '2016-01-25': SLOT2, '2016-01-27': SLOT1, '2016-01-29': SLOT2, '2016-01-31': SLOT1}
scores = []
for day in RANGE.keys():
    x = select_last_points(day, RANGE[day])
    y_pred_grd_lm = lm.predict_proba(enc.transform(grd.apply(x[columns])[:, :, 0]))[:, 1]
    # score3 is not defined in this notebook state; it appears analogous to score_GBD
    score = score3(day, pd.Series(y_pred_grd_lm, index=x.index))
    scores.append(score * x.shape[0])
    print '\t score: {}'.format(score)
print np.array(scores).sum() / 2838
In [ ]:
SLOT1 = range(44, 152, 12)
SLOT2 = range(56, 152, 12)
RANGE = {'2016-01-23': SLOT1, '2016-01-25': SLOT2, '2016-01-27': SLOT1, '2016-01-29': SLOT2, '2016-01-31': SLOT1}
ans = pd.DataFrame()
scores = pd.DataFrame()
for day in RANGE.keys():
    x = select_last_points(day, RANGE[day])
    y_pred_grd_lm = lm.predict_proba(enc.transform(grd.apply(x[columns])[:, :, 0]))[:, 1]
    a, s = score4(day, pd.Series(y_pred_grd_lm, index=x.index))
    ans = pd.concat([ans, a], axis=1)
    scores = pd.concat([scores, s], axis=1)
In [ ]:
worst = scores[(scores>=0.8).any(1)]
print ans.loc[worst.index]['2016-01-23'].value_counts()[:10]
print ans.loc[worst.index]['2016-01-25'].value_counts()[:10]
print ans.loc[worst.index]['2016-01-27'].value_counts()[:10]
print ans.loc[worst.index]['2016-01-29'].value_counts()[:10]
print ans.loc[worst.index]['2016-01-31'].value_counts()[:10]
In [ ]:
def score4(day, pred):
    a = test_order[day].reindex(index(D_range, S_range[day])).fillna(0)
    ans = a['gap'].values
    pred = pred[ans > 0]
    ans = ans[ans > 0]
    gap = (ans - pred) / ans
    temp = pd.DataFrame(ans, index=pred.index, columns=[day])
    return temp, pd.DataFrame(np.fabs(gap), columns=[day])
In [ ]:
SLOT1 = range(44, 152, 12)
SLOT2 = range(56, 152, 12)
RANGE = {'2016-01-23': SLOT1, '2016-01-25': SLOT2, '2016-01-27': SLOT1, '2016-01-29': SLOT2, '2016-01-31': SLOT1}
ans = pd.DataFrame()
scores = pd.DataFrame()
for day in RANGE.keys():
    x = select_last_points(day, RANGE[day])
    a, s = score4(day, pd.Series(clf.predict(x[columns]), index=x.index))
    ans = pd.concat([ans, a], axis=1)
    scores = pd.concat([scores, s], axis=1)
In [ ]:
worst = scores[(scores>=0.8).any(1)]
print ans.loc[worst.index]['2016-01-23'].value_counts()[:10]
print ans.loc[worst.index]['2016-01-25'].value_counts()[:10]
print ans.loc[worst.index]['2016-01-27'].value_counts()[:10]
print ans.loc[worst.index]['2016-01-29'].value_counts()[:10]
print ans.loc[worst.index]['2016-01-31'].value_counts()[:10]
In [ ]:
from sklearn.ensemble import GradientBoostingRegressor
slot1 = range(45,153,12) # Last time slot of each test window on days 23, 27, 31
slot2 = range(57,153,12) # Last time slot of each test window on days 25, 29
S_range = {'2016-01-23':slot1, '2016-01-25':slot2, '2016-01-27':slot1, '2016-01-29':slot2, '2016-01-31':slot1}
In [ ]:
def write3(x, day, slot, regr, mode):
    with open('ans3_v1.csv', mode) as f:
        writer = csv.writer(f, delimiter=',')
        for D in D_range:
            for S in slot:
                key = (D, S)
                if key in x.index:
                    gap = x['gap'].loc[key] + regr.predict(x.loc[key].values.reshape(1, -1))[0]
                else:
                    gap = 1
                gap = 0 if gap < 0 else gap
                writer.writerow([str(D), '{}-{}'.format(day, S + 1), '{:.15f}'.format(gap)])
In [ ]:
# ans3_v1.csv
# params = {'loss': 'huber', 'alpha': 0.9, 'n_estimators': 250, 'max_features': 0.5, 'random_state': 1,
#           'warm_start': False, 'max_depth': 3, 'learning_rate': 0.1, 'subsample': 1.0}
# ----- brand-new tuning
params = {'loss': 'huber', 'alpha': 0.9, 'n_estimators': 20, 'max_features': 1.0, 'random_state': 1,
          'warm_start': False, 'max_depth': 10, 'learning_rate': 0.25, 'subsample': 0.85,
          'min_samples_leaf': 25, 'min_samples_split': 100}
# Parameters to be grid-searched
parameters = {'n_estimators': np.arange(100, 600, 150)}
In [ ]:
# Searching best parameters
# scoring_function = make_scorer(mean_squared_error, greater_is_better=False)
# regr = grid_search.GridSearchCV(GradientBoostingRegressor(**params),
#                                 param_grid=parameters, scoring=scoring_function, cv=3)
# regr.fit(X, Y)
# Regr = regr.best_estimator_
from sklearn.ensemble import GradientBoostingRegressor
params = {'loss': 'huber', 'alpha': 0.9, 'n_estimators': 250, 'max_features': 0.5, 'random_state': 1,
          'warm_start': False, 'max_depth': 3, 'learning_rate': 0.1, 'subsample': 1.0}
Regr = GradientBoostingRegressor(**params)
Regr.fit(X, Y)
print Regr
In [ ]:
# max_features: sqrt -> 0.5 : 0.538287 -> 0.528706599387
# max_features: 0.5 -> 0.8 : 0.528706599387 -> 0.530272920336
# ----- brand-new tuning
# alpha: 0.9 -> 0.6 : 0.537699383041 -> 0.576867122284
# learning_rate: 0.2 : 0.536744595218
SLOT1 = range(44, 152, 12)
SLOT2 = range(56, 152, 12)
RANGE = {'2016-01-23': SLOT1, '2016-01-25': SLOT2, '2016-01-27': SLOT1, '2016-01-29': SLOT2, '2016-01-31': SLOT1}
scores = []
for day in RANGE.keys():
    x = select_last_points(day, RANGE[day])
    score = score3(day, x['gap'] + Regr.predict(x))
    scores.append(score * x.shape[0])
    print '\t score: {}'.format(score)
print np.array(scores).sum() / 2838
In [ ]:
SLOT1 = range(45, 153, 12)
SLOT2 = range(57, 153, 12)
RANGE = {'2016-01-23': SLOT1, '2016-01-25': SLOT2, '2016-01-27': SLOT1, '2016-01-29': SLOT2, '2016-01-31': SLOT1}
scores = []
for day in RANGE.keys():
    x = select_last_points(day, RANGE[day])
    write3(x, day, S_range[day], Regr, 'a')
In [ ]:
slot1 = range(45,153,12)
slot2 = range(57,153,12)
S_range = {'2016-01-22':slot1, '2016-01-24':slot2, '2016-01-26':slot1, '2016-01-28':slot2, '2016-01-30':slot1}
D_range = range(1,67)
In [ ]:
Day_range = {'2016-01-22': pd.date_range('1/2/2016', periods=3, freq='7D'),
             '2016-01-24': pd.date_range('1/3/2016', periods=3, freq='7D'),
             '2016-01-26': pd.date_range('1/5/2016', periods=3, freq='7D'),
             '2016-01-28': pd.date_range('1/7/2016', periods=3, freq='7D'),
             '2016-01-30': pd.date_range('1/9/2016', periods=2, freq='7D')}
def naive_score2(day):
    ans = test_order[day].select(lambda x: x[1] in S_range[day])
    deltas = []
    for d in Day_range[day]:
        data = train_order[str(d.date())].reindex([x for x in itertools.product(D_range, range(1, 145))]).fillna(0)
        data = data.diff().shift(-1)
        data = data.loc[ans.index]
        deltas.append(data)
    delta = deltas[0]
    for i in range(1, len(deltas)):
        delta = delta.add(deltas[i], fill_value=0)
    pred = test_order[day].shift().loc[ans.index].fillna(0) + (delta / len(deltas))
    gap = (ans - pred) / ans
    return gap.abs().sum() / (66 * len(S_range[day]))
In [ ]:
print naive_score2('2016-01-22')
print naive_score2('2016-01-24')
print naive_score2('2016-01-26')
print naive_score2('2016-01-28')
print naive_score2('2016-01-30')
In [ ]:
def slope(day):
    base_points = test_order[day].select(lambda x: x[1] in S_range[day])
    deltas = []
    for d in Day_range[day]:
        data = train_order[str(d.date())].reindex([x for x in itertools.product(D_range, range(1, 145))]).fillna(0)
        data = data.diff().shift(-1)
        deltas.append(data)
    delta = deltas[0]
    for i in range(1, len(deltas)):
        delta = delta + deltas[i]
    return base_points, delta / len(deltas)
In [ ]:
def write2(day, base_points, slot, delta, mode):
    with open('ans.csv', mode) as f:
        writer = csv.writer(f, delimiter=',')
        for D in range(1, 67):
            for S in slot:
                key = (D, S - 1)
                if key in base_points.index:
                    gap = base_points['gap'].loc[key] + delta['gap'].loc[key]
                    gap = base_points['gap'].loc[key] if gap < 0 else gap
                else:
                    gap = 0.0
                writer.writerow([D, '{}-{}'.format(day, S), '{:.3f}'.format(gap)])
In [ ]:
day = '2016-01-22'
base_points, delta = slope(day)
write2(day, base_points, test_slot1, delta, 'w')
In [ ]:
day = '2016-01-24'
base_points, delta = slope(day)
write2(day, base_points, test_slot2, delta, 'a')
In [ ]:
day = '2016-01-26'
base_points, delta = slope(day)
write2(day, base_points, test_slot1, delta, 'a')
In [ ]:
day = '2016-01-28'
base_points, delta = slope(day)
write2(day, base_points, test_slot2, delta, 'a')
In [ ]:
day = '2016-01-30'
base_points, delta = slope(day)
write2(day, base_points, test_slot1, delta, 'a')
In [ ]:
Day_range = {'2016-01-22': pd.date_range('1/1/2016', periods=3, freq='7D'),
             '2016-01-24': pd.date_range('1/3/2016', periods=3, freq='7D'),
             '2016-01-26': pd.date_range('1/5/2016', periods=3, freq='7D'),
             '2016-01-28': pd.date_range('1/7/2016', periods=3, freq='7D'),
             '2016-01-30': pd.date_range('1/9/2016', periods=2, freq='7D')}
def naive_score1(day):
    ans = test_order[day].select(lambda x: x[1] in S_range[day])
    prediction = []
    for d in Day_range[day]:
        data = train_order[str(d.date())]
        data = data.loc[ans.index].fillna(0)
        prediction.append(data)
    pred = prediction[0]
    for i in range(1, len(prediction)):
        pred = pred.add(prediction[i], fill_value=0)
    pred = pred / len(prediction)
    gap = (ans - pred) / ans
    return gap.abs().sum() / (66 * len(S_range[day]))
In [ ]:
print naive_score1('2016-01-22')
print naive_score1('2016-01-24')
print naive_score1('2016-01-26')
print naive_score1('2016-01-28')
print naive_score1('2016-01-30')
In [ ]:
test_slot1 = range(46,154,12)
test_slot2 = range(58,154,12)
In [ ]:
test_day22 = []
In [ ]:
for day in pd.date_range('1/1/2016', periods=3, freq='7D'):
    # Read data
    order = pd.read_table(path['train']['order'].format(day.date()), header=None, usecols=[1, 3, 6],
                          names=['driver', 'district', 'time'])
    # Select the rows with a null driver for calculating the gap
    order = order[order['driver'].isnull()]
    # Translate the district hash to its id
    order['district'] = order['district'].apply(lambda x: district_dict[x])
    # Translate the timestamp to a time slot id
    order['time'] = pd.to_datetime(order['time'])
    order['time'] = (order['time'] - day) / M / 10 + 1
    order['time'] = order['time'].astype(int)
    # Group by district and time
    test_day22.append(order.groupby(['district', 'time']))
In [ ]:
with open('ans.csv', 'w') as f:
    writer = csv.writer(f, delimiter=',')
    for D in range(1, 67):
        for S in test_slot1:
            key = (D, S)
            gap = []
            for data in test_day22:
                if key in data.groups:
                    gap.append(data.get_group(key).shape[0])
                else:
                    gap.append(0)
            writer.writerow([D, '2016-01-22-{}'.format(S), '{:.3f}'.format(np.mean(gap))])
In [ ]:
test_day24 = []
In [ ]:
for day in pd.date_range('1/3/2016', periods=3, freq='7D'):
    # Read data
    order = pd.read_table(path['train']['order'].format(day.date()), header=None, usecols=[1, 3, 6],
                          names=['driver', 'district', 'time'])
    # Select the rows with a null driver for calculating the gap
    order = order[order['driver'].isnull()]
    # Translate the district hash to its id
    order['district'] = order['district'].apply(lambda x: district_dict[x])
    # Translate the timestamp to a time slot id
    order['time'] = pd.to_datetime(order['time'])
    order['time'] = (order['time'] - day) / M / 10 + 1
    order['time'] = order['time'].astype(int)
    # Group by district and time
    test_day24.append(order.groupby(['district', 'time']))
In [ ]:
with open('ans.csv', 'a') as f:
    writer = csv.writer(f, delimiter=',')
    for D in range(1, 67):
        for S in test_slot2:
            key = (D, S)
            gap = []
            for data in test_day24:
                if key in data.groups:
                    gap.append(data.get_group(key).shape[0])
                else:
                    gap.append(0)
            writer.writerow([D, '2016-01-24-{}'.format(S), '{:.3f}'.format(np.mean(gap))])
In [ ]:
test_day26 = []
In [ ]:
for day in pd.date_range('1/5/2016', periods=3, freq='7D'):
    # Read data
    order = pd.read_table(path['train']['order'].format(day.date()), header=None, usecols=[1, 3, 6],
                          names=['driver', 'district', 'time'])
    # Select the rows with a null driver for calculating the gap
    order = order[order['driver'].isnull()]
    # Translate the district hash to its id
    order['district'] = order['district'].apply(lambda x: district_dict[x])
    # Translate the timestamp to a time slot id
    order['time'] = pd.to_datetime(order['time'])
    order['time'] = (order['time'] - day) / M / 10 + 1
    order['time'] = order['time'].astype(int)
    # Group by district and time
    test_day26.append(order.groupby(['district', 'time']))
In [ ]:
with open('ans.csv', 'a') as f:
    writer = csv.writer(f, delimiter=',')
    for D in range(1, 67):
        for S in test_slot1:
            key = (D, S)
            gap = []
            for data in test_day26:
                if key in data.groups:
                    gap.append(data.get_group(key).shape[0])
                else:
                    gap.append(0)
            writer.writerow([D, '2016-01-26-{}'.format(S), '{:.3f}'.format(np.mean(gap))])
In [ ]:
test_day28 = []
In [ ]:
for day in pd.date_range('1/7/2016', periods=3, freq='7D'):
    # Read data
    order = pd.read_table(path['train']['order'].format(day.date()), header=None, usecols=[1, 3, 6],
                          names=['driver', 'district', 'time'])
    # Select the rows with a null driver for calculating the gap
    order = order[order['driver'].isnull()]
    # Translate the district hash to its id
    order['district'] = order['district'].apply(lambda x: district_dict[x])
    # Translate the timestamp to a time slot id
    order['time'] = pd.to_datetime(order['time'])
    order['time'] = (order['time'] - day) / M / 10 + 1
    order['time'] = order['time'].astype(int)
    # Group by district and time
    test_day28.append(order.groupby(['district', 'time']))
In [ ]:
with open('ans.csv', 'a') as f:
    writer = csv.writer(f, delimiter=',')
    for D in range(1, 67):
        for S in test_slot2:
            key = (D, S)
            gap = []
            for data in test_day28:
                if key in data.groups:
                    gap.append(data.get_group(key).shape[0])
                else:
                    gap.append(0)
            writer.writerow([D, '2016-01-28-{}'.format(S), '{:.3f}'.format(np.mean(gap))])
In [ ]:
test_day30 = []
In [ ]:
for day in pd.date_range('1/9/2016', periods=2, freq='7D'):
    # Read data
    order = pd.read_table(path['train']['order'].format(day.date()), header=None, usecols=[1, 3, 6],
                          names=['driver', 'district', 'time'])
    # Select the rows with a null driver for calculating the gap
    order = order[order['driver'].isnull()]
    # Translate the district hash to its id
    order['district'] = order['district'].apply(lambda x: district_dict[x])
    # Translate the timestamp to a time slot id
    order['time'] = pd.to_datetime(order['time'])
    order['time'] = (order['time'] - day) / M / 10 + 1
    order['time'] = order['time'].astype(int)
    # Group by district and time
    test_day30.append(order.groupby(['district', 'time']))
In [ ]:
with open('ans.csv', 'a') as f:
    writer = csv.writer(f, delimiter=',')
    for D in range(1, 67):
        for S in test_slot1:
            key = (D, S)
            gap = []
            for data in test_day30:
                if key in data.groups:
                    gap.append(data.get_group(key).shape[0])
                else:
                    gap.append(0)
            writer.writerow([D, '2016-01-30-{}'.format(S), '{:.3f}'.format(np.mean(gap))])
In [ ]:
day = '2016-01-01'
Day = pd.Timestamp(day)
In [ ]:
order = pd.read_table(path['train']['order'].format(day), header=None, usecols=[1, 3, 6],
                      names=['driver', 'district_id', 'time'])
order = order[order['driver'].isnull()] # Select the rows with a null driver for calculating the gap
In [ ]:
order['district_id'] = order['district_id'].apply(lambda x: district_dict[x])
In [ ]:
order['time'] = pd.to_datetime(order['time'])
In [ ]:
order['time_slot'] = (order['time'] - Day) / M / 10 + 1
order['time_slot'] = order['time_slot'].astype(int)