Didi_source



In [5]:
from pandas.tseries.offsets import *
import pandas as pd
import numpy as np
import csv
import matplotlib.pyplot as plt
import itertools

%matplotlib inline

In [6]:
# Relative locations of the raw data files, keyed by data set ('train' / 'test')
# and then by table name. '{}' placeholders are filled in with a date by the loaders.
path = {
    'train': {
        'order': './training_data_2/order_data/order_data_{}',
        'weather': './training_data_2/weather_data/weather_data_{}',
        'traffic': './training_data_2/traffic_data/traffic_data_{}',
        'district': './training_data_2/cluster_map/cluster_map',
        'poi': './training_data_2/poi_data/poi_data',
    },
    'test': {
        'order': './test_data_2/order_data/order_data_{}_test',
        'weather': './test_data_2/weather_data/weather_data_{}_test',
        'traffic': './test_data_2/traffic_data/traffic_data_{}_test',
        'district': './test_data_2/cluster_map/cluster_map',
    },
}

# One minute; dividing a timedelta by M yields a number of minutes.
M = np.timedelta64(1, 'm')

# All district ids (1..66) and all time slots of a day (1..144).
D_range = range(1, 67)
T_range = range(1, 145)

# Last known time slot before each slot to predict.
test_slot1 = range(45, 153, 12)  # used for days 23, 27 and 31
test_slot2 = range(57, 153, 12)  # used for days 25 and 29
# Time slots used for prediction, keyed by test date.
S_range = {
    '2016-01-23': test_slot1,
    '2016-01-25': test_slot2,
    '2016-01-27': test_slot1,
    '2016-01-29': test_slot2,
    '2016-01-31': test_slot1,
}

# District lookup table: first column of the cluster map -> second column.
district_dict = pd.read_table(path['train']['district'], header=None, index_col=0)[1].to_dict()

In [7]:
# Sanity check: the cluster map is expected to define exactly 66 districts.
print len(district_dict) == 66


True

Utility Functions


In [8]:
def index(district, slot):
    """Return the list of (district, slot) tuples for the given ids.

    Each argument may be a single integer or an iterable of integers; the
    result is the Cartesian product, ordered district-major (all slots of the
    first district, then all slots of the second, ...).

    Scalars are recognised with ``isinstance`` so that numpy integers (for
    example a value taken out of a numpy array or a pandas index) are treated
    as scalars too; the previous ``type(x) is int`` test rejected them and the
    call then failed inside ``itertools.product``.
    """
    def as_sequence(value):
        # Wrap a lone integer so itertools.product can iterate over it.
        if isinstance(value, (int, np.integer)):
            return [value]
        return value

    return list(itertools.product(as_sequence(district), as_sequence(slot)))

In [9]:
def District(df):
    """Translate df['district'] through the module-level district_dict.

    Returns a Series with the same index as df; a value missing from
    district_dict raises KeyError.
    """
    def lookup(raw_id):
        return district_dict[raw_id]

    return df['district'].apply(lookup)

In [10]:
def Weekday(df):
    """Return the weekday number (Monday=0 ... Sunday=6) of each df['time'] entry."""
    def day_of_week(stamp):
        return stamp.weekday()

    stamps = pd.to_datetime(df['time'])
    return stamps.map(day_of_week)

In [11]:
def Time(df, day):
    """Convert df['time'] timestamps into 1-based 10-minute slot numbers.

    `day` is anything pd.Timestamp accepts (the callers pass a date or its
    string form); slots are counted from that timestamp, so 00:00-00:09 of
    `day` is slot 1. Relies on the module-level one-minute unit M.
    """
    stamps = pd.to_datetime(df['time'])
    minutes_since_start = (stamps - pd.Timestamp(day)) / M
    slot_numbers = minutes_since_start / 10 + 1
    # astype(int) truncates the fractional part of the slot number.
    return slot_numbers.astype(int)

Getting testing data and training data


In [12]:
def Traffic(day, option):
    """Load one day of traffic data, indexed by (district, time slot).

    day    -- pandas Timestamp of the day to load
    option -- 'train' or 'test'; selects the file location from `path`

    Returns a DataFrame sorted by its (district, time) MultiIndex with the
    columns weekday, day, district, time and LV1..LV4.
    """
    source = path[option]['traffic'].format(day.date())
    raw = pd.read_table(source, header=None,
                        names=['district', 'LV1', 'LV2', 'LV3', 'LV4', 'time'])
    raw['district'] = District(raw)
    # Weekday must be derived before 'time' is overwritten with slot numbers.
    raw['weekday'] = Weekday(raw)
    raw['time'] = Time(raw, str(day.date()))

    def value_after_colon(cell):
        # Each LV cell has the form '<prefix>:<value>'; keep the value part.
        return cell.split(':')[1]

    for level in range(1, 5):
        level_name = 'LV{}'.format(level)
        raw[level_name] = raw[level_name].apply(value_after_colon).astype(int)

    row_index = pd.MultiIndex.from_arrays([raw['district'].values, raw['time'].values],
                                          names=('district', 'time'))
    features = {'weekday': raw['weekday'].values,
                'day': day.day,
                'district': raw['district'].values,
                'time': raw['time'].values,
                'LV1': raw['LV1'].values,
                'LV2': raw['LV2'].values,
                'LV3': raw['LV3'].values,
                'LV4': raw['LV4'].values}
    return pd.DataFrame(features, index=row_index).sort_index()

In [13]:
# DTG: District Time Gap
def DTG(day, option):
    """Count, per (district, time slot), the orders of one day that have no driver.

    day    -- pandas Timestamp of the day to load
    option -- 'train' or 'test'; selects the file location from `path`

    Only columns 1, 3 and 6 of the order file are read (named driver,
    district and time here). Rows whose driver field is null are kept and
    counted; the result is a DataFrame with a single 'gap' column indexed by
    (district, time). Combinations without such orders are absent.
    """
    df = pd.read_table(path[option]['order'].format(day.date()), header=None, usecols=[1,3,6],
                      names=['driver', 'district', 'time'])
    # Take an explicit copy: the columns assigned below would otherwise be
    # written onto a filtered slice of df (SettingWithCopy hazard).
    unanswered = df[df['driver'].isnull()].copy()
    unanswered['district'] = District(unanswered)
    unanswered['time'] = Time(unanswered, day.date())
    per_slot = unanswered.groupby(['district', 'time'])
    return pd.DataFrame({'gap': per_slot.size()})

In [14]:
def Weather(day, option):
    """Load one day of weather data with one row per time slot in T_range.

    day    -- pandas Timestamp of the day to load
    option -- 'train' or 'test'; selects the file location from `path`

    The first observation of each slot is kept. Slots without an observation
    are filled from the next observed slot, and any still-empty trailing
    slots from the previous one.
    """
    source = path[option]['weather'].format(day.date())
    observed = pd.read_table(source, header=None,
                             names=['time', 'weather', 'temprature', 'pm2.5'])
    observed['time'] = Time(observed, day.date())
    observed = observed.drop_duplicates(subset='time')

    # Empty frame covering every slot, then overlay the observed rows.
    full_day = pd.DataFrame({'time': T_range}, columns=observed.columns).set_index('time')
    full_day.update(observed.set_index('time'))
    return full_day.bfill().ffill()

In [15]:
# Test data, one entry per test date (keyed by 'YYYY-MM-DD'):
#   test_order   -- gap counts,   cols = [gap]
#   test_traffic -- traffic data, cols = [weekday, day, district, time, LV1, LV2, LV3, LV4]
#   test_weather -- weather data, cols = [weather, temprature, pm2.5]
test_order, test_traffic, test_weather = {}, {}, {}
# Five test days, every second day starting on 2016-01-23.
for day in pd.date_range('1/23/2016', periods=5, freq='2D'):
    date_key = str(day.date())
    test_order[date_key] = DTG(day, 'test')
    test_traffic[date_key] = Traffic(day, 'test')
    test_weather[date_key] = Weather(day, 'test')

In [16]:
# Check data: five test days loaded per table, with the expected column layout.
for loaded in (test_order, test_traffic, test_weather):
    print(len(loaded.keys()) == 5)
print(all(test_order['2016-01-23'].columns.values == np.array(['gap'])))
print(all(test_traffic['2016-01-23'].columns.values
          == np.array(['LV1', 'LV2', 'LV3', 'LV4', 'day', 'district', 'time', 'weekday'])))
print(all(test_weather['2016-01-23'].columns.values == np.array(['weather', 'temprature', 'pm2.5'])))


True
True
True
True
True
True

In [21]:
# Training data, one entry per training date (keyed by 'YYYY-MM-DD'):
#   train_order   -- gap counts,   cols = [gap]  (loading currently disabled)
#   train_traffic -- traffic data, cols = [weekday, day, district, time, LV1, LV2, LV3, LV4]
#   train_weather -- weather data, cols = [weather, temprature, pm2.5]
# train_order = {}
train_traffic, train_weather = {}, {}
# 21 consecutive training days starting on 2016-01-01.
for day in pd.date_range('1/1/2016', periods=21, freq='D'):
    date_key = str(day.date())
#     train_order[date_key] = DTG(day, 'train')
    train_traffic[date_key] = Traffic(day, 'train')
    train_weather[date_key] = Weather(day, 'train')

In [14]:
# Check data: 21 training days loaded per table.
# Loading train_order is commented out in the loading cell, so on a fresh
# kernel the name does not exist; only check it when it has been loaded.
if 'train_order' in globals():
    print(len(train_order.keys()) == 21)
print(len(train_traffic.keys()) == 21)
print(len(train_weather.keys()) == 21)


True
True
True

POI


In [36]:
# POI2: per-district point-of-interest counts, loaded from the cached ./POI2.csv.
# The commented-out code below is what generated that cache from the raw POI
# file: it keeps only sub-categories whose first-level category is one of
# 4, 13, 16, 23, 24 and fills missing counts with 0.
# POI2 = pd.DataFrame()
# with open(path['train']['poi'], 'r') as f:
#     for i, line in enumerate(f): 
#         interests = line.strip().split('\t')
#         row = {'district': interests[0]}
#         for item in interests[1:]:
#             category,num = item.split(':')
#             if int(category.split('#')[0]) in [4,13,16,23,24]:
#                 if category in row:
#                     row[category] += int(num)
#                 else:
#                     row[category] = int(num)
#         POI2 = pd.concat( [POI2, pd.DataFrame(row, index=[i])])
# POI2['district'] = District(POI2)
# POI2 = POI2.set_index('district').sort_index()
# POI2 = POI2.fillna(0)
# POI2.to_csv('./POI2.csv', columns=POI2.columns, header=True)

POI2 = pd.read_csv('./POI2.csv', index_col='district')

In [37]:
# POI: per-district point-of-interest counts, loaded from the cached ./POI.csv.
# The commented-out code below is what generated that cache: counts are summed
# per first-level category (1..25), missing values are filled with 0 and each
# column is standardized to zero mean and unit standard deviation.
# columns = ['district'] + range(1,26)
# POI = pd.DataFrame(columns=columns)
# with open(path['train']['poi'], 'r') as f:
#     for i, line in enumerate(f): 
#         interests = line.strip().split('\t')
#         row = {'district': interests[0]}
#         for item in interests[1:]:
#             category,num = item.split(':')
#             category = int(category.split('#')[0])
#             if category in row:
#                 row[category] += int(num)
#             else:
#                 row[category] = int(num)
#         POI = pd.concat( [POI, pd.DataFrame(row, index=[i], columns=columns)])
# POI['district'] = District(POI)
# POI = POI.set_index('district').sort_index()
# POI = POI.fillna(0)
# # Standardization
# POI = (POI - POI.mean()) / POI.std()
# POI.to_csv('./POI.csv', columns=POI.columns, header=True)

POI = pd.read_csv('./POI.csv', index_col='district')

Explore which district is the best for replacing district 54


In [652]:
# Merge the weather and gap data of each day and store the result in one CSV file per day
# def preprocessing1(day):
#     X = train_order[day].reindex(index(D_range,T_range)).fillna(0)
#     X['last'] = X['gap'].shift().fillna(method='bfill')
#     X['last2'] = X['gap'].shift(2).fillna(method='bfill') 
#     X['GAP'] = pd.Series(index=X.index)
#     X['time'] = pd.Series(index=X.index)
#     X['district'] = pd.Series(index=X.index)
#     for D in D_range:
#         X.loc[(D, T_range), 'GAP'] = X.loc[(D, T_range),'gap'].shift(-1).fillna(method='ffill')
#         X.loc[(D, T_range), 'time'] = pd.DataFrame(T_range, columns=['time'],index=index(D, T_range))
#         X.loc[(D, T_range), 'district'] = pd.DataFrame([D]*144, columns=['district'],index=index(D, T_range))
#     X = X.join(train_weather[day], on='time')
#     Y_gap = X['GAP']
#     X.drop('GAP', axis=1, inplace=True)
#     return X, Y_gap

# for d in pd.date_range('1/2/2016', periods=20, freq='D'):
#     X, Y_gap = preprocessing1(str(d.date()))
    
#     X = X.join(POI2,on='district')
#     X.sort_index(inplace=True)
    
#     Y_gap.sort_index(inplace=True)
#     X.to_csv('./X2/{}.csv'.format(str(d.date())), columns=X.columns, header=True)
#     Y_gap.to_csv('./Y2_gap/{}.csv'.format(str(d.date())), header=True)

In [ ]:
# # from sklearn.ensemble import GradientBoostingClassifier
# # from sklearn.cross_validation import train_test_split

# without54 = range(1,67)
# without54.remove(54)
# def preprocessing2(day, Mask):
#     # filling traffic of district 54 with other district    
#     traffic_rest = train_traffic[day].reindex(index(without54, T_range)).fillna(method='bfill')
#     mask = Mask # The district replace district 54
#     traffic_54 = pd.DataFrame(traffic_rest.loc[index(mask, T_range)].values, 
#                               index=index(54, T_range), columns=traffic_rest.columns)
#     return traffic_rest, traffic_54


# for D in without54:
#     X_rest = []
#     Y_gap_rest = []
#     X_54 = []
#     Y_gap_54 = []
#     for d in pd.date_range('1/2/2016', periods=20, freq='D'):
#         X_all = pd.read_csv('./X2/{}.csv'.format(str(d.date())), index_col=('district', 'time'))
#         Y_gap_all = pd.read_csv('./Y2_gap/{}.csv'.format(str(d.date())), index_col=('district', 'time'))
#         traffic_rest, traffic_54 = preprocessing2(str(d.date()), D)
#         tempX_rest = pd.concat([X_all.loc[(without54, T_range),:], traffic_rest], axis=1)
#         tempX_54 = pd.concat([X_all.loc[(54, T_range),:], traffic_54],axis=1)
#         tempX_rest = tempX_rest.drop(['time.1','district.1'],1)
#         tempX_54 = tempX_54.drop(['time.1','district.1'],1)
#         X_rest.append(tempX_rest)
#         X_54.append(tempX_54)
#         Y_gap_rest.append(Y_gap_all.loc[(without54, T_range),:])
#         Y_gap_54.append(Y_gap_all.loc[(54, T_range),:])
#     X_rest = pd.concat(X_rest)
#     X_rest.sort_index(inplace=True)
#     X_54 = pd.concat(X_54)
#     X_54.sort_index(inplace=True)
#     Y_gap_rest = pd.concat(Y_gap_rest)
#     Y_gap_rest.sort_index(inplace=True)
#     Y_gap_54 = pd.concat(Y_gap_54)
#     Y_gap_54.sort_index(inplace=True)


#     Y_gap_rest[Y_gap_rest>10]=11
#     Y_gap_54[Y_gap_54>10]=11
      
#     columns = ['gap', 'last', 'last2', 'LV1', 'LV2', 'LV3', 'LV4', 'district', 'time', 'pm2.5']
#     X_train, X_test, y_train, y_test = train_test_split(X_rest[columns], Y_gap_rest, test_size=0.4)
#     params = {'loss': 'deviance', 'learning_rate': 0.1, 'n_estimators': 20, 'min_samples_leaf':10, 'min_samples_split':100,
#               'max_depth': 3, 'max_features': 0.8, 'subsample': 1.0, 'min_samples_split': 20, 'random_state':1,
#               'verbose':0}
#     grd = GradientBoostingClassifier(**params)
#     grd.fit(X_train[columns], y_train['GAP'])
    
#     testX = X_54[Y_gap_54['GAP']>0][columns]
#     testY = Y_gap_54[Y_gap_54['GAP']>0]['GAP']
#     print "Replaced by disctric {} : {:.8f}".format(D,((testY - grd.predict(testX)).abs() / testY).sum() / testY.shape[0])

In [ ]:
# Replaced by disctric 17 : 0.98912147
# Replaced by disctric 18 : 0.98698988
# Replaced by disctric 19 : 0.77650698
# Replaced by disctric 20 : 0.72371558  -> 3rd
# Replaced by disctric 21 : 0.90578814
# Replaced by disctric 22 : 0.85776423
# Replaced by disctric 23 : 3.11777664
# Replaced by disctric 24 : 0.59887103  -> 1st
# Replaced by disctric 25 : 0.74924907   
# Replaced by disctric 26 : 0.98615067
# Replaced by disctric 27 : 0.80867298
# Replaced by disctric 28 : 0.61996468  -> 2nd
# Replaced by disctric 29 : 0.85480506
# Replaced by disctric 30 : 0.98992951
# Replaced by disctric 31 : 0.97505635
# Replaced by disctric 32 : 1.00273342
# Replaced by disctric 33 : 0.99017887
# Replaced by disctric 34 : 0.98325421
# Replaced by disctric 35 : 0.99271328
# Replaced by disctric 36 : 0.99436532
# Replaced by disctric 37 : 1.04983252
# Replaced by disctric 38 : 0.97790965
# Replaced by disctric 39 : 0.99463147
# Replaced by disctric 40 : 0.99509663
# Replaced by disctric 41 : 0.98831103
# Replaced by disctric 42 : 0.74919821
# Replaced by disctric 43 : 0.99647533
# Replaced by disctric 44 : 0.99103486
# Replaced by disctric 45 : 0.98905673
# Replaced by disctric 46 : 0.90203604
# Replaced by disctric 47 : 0.97719033
# Replaced by disctric 48 : 1.65585771
# Replaced by disctric 49 : 0.99523330
# Replaced by disctric 50 : 0.99402964
# Replaced by disctric 51 : 3.39464397
# Replaced by disctric 52 : 0.99622356
# Replaced by disctric 53 : 0.99042104
# Replaced by disctric 55 : 0.99966432
# Replaced by disctric 56 : 0.99899295
# Replaced by disctric 57 : 0.98358989
# Replaced by disctric 58 : 0.99629550
# Replaced by disctric 59 : 0.99899295
# Replaced by disctric 60 : 0.99699324
# Replaced by disctric 61 : 0.99699324
# Replaced by disctric 62 : 1.00290126
# Replaced by disctric 63 : 0.99707956
# Replaced by disctric 64 : 0.99622356
# Replaced by disctric 65 : 0.99874119
# Replaced by disctric 66 : 0.99471299

Preparing training data X and Y


In [23]:
# Filling traffic of district 54 with other district
def replace54(day, mask):
    """Build the training traffic of `day` with district 54 borrowed from `mask`.

    day  -- date key ('YYYY-MM-DD') into train_traffic
    mask -- id of the district whose traffic rows stand in for district 54

    Returns (traffic_rest, traffic_54): the traffic of every district except
    54 on the full (district, slot) grid, and a copy of the `mask` district's
    rows re-indexed as district 54.
    """
    others = [d for d in range(1, 67) if d != 54]
    # NOTE(review): the backward fill runs over the whole frame, so a gap at
    # the end of one district is filled from the next district's first slot,
    # and the 'district'/'time' columns are filled the same way -- confirm
    # this is intended.
    traffic_rest = train_traffic[day].reindex(index(others, T_range)).fillna(method='bfill')
    donor_rows = traffic_rest.loc[index(mask, T_range)]
    traffic_54 = pd.DataFrame(donor_rows.values,
                              index=index(54, T_range),
                              columns=traffic_rest.columns)
    return traffic_rest, traffic_54

In [18]:
def Masking(Mask):
    """Assemble the training features X and targets Y for 2016-01-02 .. 2016-01-21.

    Mask -- id of the district whose traffic replaces that of district 54

    For each day the cached ./X2 and ./Y2_gap CSV files are read, the traffic
    features from replace54 are attached column-wise, and the per-day frames
    (sorted by (district, time)) are concatenated in date order.
    Returns the tuple (X, Y).
    """
    others = [d for d in range(1, 67) if d != 54]
    # Extra 'time'/'district' columns carried by the cached X files.
    redundant = ['time.1', 'district.1']
    features_per_day = []
    targets_per_day = []
    for d in pd.date_range('1/2/2016', periods=20, freq='D'):
        date_key = str(d.date())
        X_all = pd.read_csv('./X2/{}.csv'.format(date_key), index_col=('district', 'time'))
        Y_all = pd.read_csv('./Y2_gap/{}.csv'.format(date_key), index_col=('district', 'time'))
        traffic_rest, traffic_54 = replace54(date_key, Mask)

        # Features: all other districts first, then district 54 with borrowed traffic.
        rest_features = pd.concat([X_all.loc[(others, T_range), :], traffic_rest], axis=1)
        rest_features = rest_features.drop(redundant, 1)
        d54_features = pd.concat([X_all.loc[(54, T_range), :], traffic_54], axis=1)
        d54_features = d54_features.drop(redundant, 1)
        features_per_day.append(pd.concat([rest_features, d54_features]).sort_index())

        # Targets, rebuilt in the same order as the features.
        day_targets = pd.concat([Y_all.loc[(others, T_range), :],
                                 Y_all.loc[(54, T_range), :]])
        targets_per_day.append(day_targets.sort_index())
    return pd.concat(features_per_day), pd.concat(targets_per_day)

Developing Zone


In [ ]:

Explore which district is better on POI2


In [769]:
# Best District : 9
# without54 = range(1,67)
# without54.remove(54)
# for D in without54:
#     X, Y = Masking(D)
#     newY = Y.copy()
#     newX = X.copy()
#     newX = newX[newY['GAP']>0]
#     newX = newX.drop('last2',axis=1)
#     newY = newY[newY['GAP']>0]
#     newY[newY['GAP']>39]=40
#     class_weight = dict(zip(range(1,38), range(38,1,-1)))
#     columns = ['gap', 'last', 'pm2.5', '16#4', '16#10', '24#2',
#                'LV1', 'LV2', 'LV3', 'LV4', 'day', 'time']
#     from sklearn.ensemble import RandomForestClassifier
#     params = {'criterion': 'entropy', 'n_estimators': 40, 'min_samples_leaf':10, 'min_samples_split':20,
#               'max_depth': 9, 'max_features': 0.85, 'random_state':1,
#               'verbose':0, 'class_weight': class_weight}
#     rfc = RandomForestClassifier(**params)
#     rfc.fit(newX[columns], newY['GAP'])
#     print "Replaced by disctric {} : {:.8f}".format(D, score_on_test_data(rfc, columns))


Replaced by disctric 1 : 0.43134926
Replaced by disctric 2 : 0.43239209
Replaced by disctric 3 : 0.43662162
Replaced by disctric 4 : 0.43378158
Replaced by disctric 5 : 0.43164034
Replaced by disctric 6 : 0.43288644
Replaced by disctric 7 : 0.42900990
Replaced by disctric 8 : 0.43107097
Replaced by disctric 9 : 0.42519920
Replaced by disctric 10 : 0.43102830
Replaced by disctric 11 : 0.42780553
Replaced by disctric 12 : 0.43088756
Replaced by disctric 13 : 0.43555360
Replaced by disctric 14 : 0.43099010
Replaced by disctric 15 : 0.43549541
Replaced by disctric 16 : 0.43234372
Replaced by disctric 17 : 0.43835595
Replaced by disctric 18 : 0.43056354
Replaced by disctric 19 : 0.43265371
Replaced by disctric 20 : 0.43311840
Replaced by disctric 21 : 0.43048995
Replaced by disctric 22 : 0.43118376
Replaced by disctric 23 : 0.43293011
Replaced by disctric 24 : 0.43149041
Replaced by disctric 25 : 0.43299912
Replaced by disctric 26 : 0.43039004
Replaced by disctric 27 : 0.43214967
Replaced by disctric 28 : 0.42867821
Replaced by disctric 29 : 0.42982001
Replaced by disctric 30 : 0.43205035
Replaced by disctric 31 : 0.43440043
Replaced by disctric 32 : 0.43541702
Replaced by disctric 33 : 0.43102059
Replaced by disctric 34 : 0.43742158
Replaced by disctric 35 : 0.43564980
Replaced by disctric 36 : 0.42799378
Replaced by disctric 37 : 0.42624577
Replaced by disctric 38 : 0.43094743
Replaced by disctric 39 : 0.43339908
Replaced by disctric 40 : 0.43751719
Replaced by disctric 41 : 0.43120191
Replaced by disctric 42 : 0.43184901
Replaced by disctric 43 : 0.43323008
Replaced by disctric 44 : 0.43530426
Replaced by disctric 45 : 0.43030129
Replaced by disctric 46 : 0.43296481
Replaced by disctric 47 : 0.43371596
Replaced by disctric 48 : 0.43144904
Replaced by disctric 49 : 0.43245215
Replaced by disctric 50 : 0.43588607
Replaced by disctric 51 : 0.43425369
Replaced by disctric 52 : 0.43262960
Replaced by disctric 53 : 0.43371754
Replaced by disctric 55 : 0.43128574
Replaced by disctric 56 : 0.43303190
Replaced by disctric 57 : 0.43176025
Replaced by disctric 58 : 0.43252662
Replaced by disctric 59 : 0.42745728
Replaced by disctric 60 : 0.43190217
Replaced by disctric 61 : 0.43445249
Replaced by disctric 62 : 0.43796914
Replaced by disctric 63 : 0.43437871
Replaced by disctric 64 : 0.43070752
Replaced by disctric 65 : 0.43270655
Replaced by disctric 66 : 0.43217432

Identify the most common gap growth for each day


In [ ]:
# DF = pd.DataFrame(columns=range(1,12))
# for d in pd.date_range('1/2/2016', periods=20, freq='1D'):
#     df = train_order[str(d.date())]
#     df = df.reindex(index(D_range, T_range)).fillna(0)
#     for D in D_range:
#         df.loc[(D, T_range),'diff'] = df.loc[(D, T_range),'gap'].diff().shift(-1).fillna(0)
#     row = pd.DataFrame(df['diff'].value_counts().sort_values(ascending=False).iloc[:11].index.values.reshape((1,11)),
#                        columns=range(1,12), index=[d.day])
#     DF = DF.append(row)
# print DF

Identify the most common gap values for each day


In [ ]:
# DF = pd.DataFrame(columns=range(1,12))
# for d in pd.date_range('1/2/2016', periods=20, freq='1D'):
#     df = train_order[str(d.date())]
#     row = pd.DataFrame(df['gap'].value_counts().sort_values(ascending=False)[:11].index.values.reshape((1,11)),
#                        columns=range(1,12), index=[d.day])
#     DF = DF.append(row)

In [ ]:
# for d in pd.date_range('1/23/2016', periods=5, freq='2D'):
#     df = test_order[str(d.date())]
#     row = pd.DataFrame(df['gap'].value_counts().sort_values(ascending=False)[:11].index.values.reshape((1,11)),
#                        columns=range(1,12), index=[d.day])
#     DF = DF.append(row)

In [ ]:
# columns is the top 11 most common gap
# index is each day
# print DF

Method 5 by Random Forest Classifier


In [30]:
def score_GBD(day, pred):
    """Mean absolute percentage error of `pred` against the true gaps of `day`.

    day  -- test date key ('YYYY-MM-DD') into test_order and S_range
    pred -- predictions ordered like index(D_range, S_range[day])

    Only entries whose true gap is positive contribute. Returns NaN when no
    entry has a positive gap.
    """
    truth = test_order[day].reindex(index(D_range, S_range[day])).fillna(0)
    # Force floats: with two integer arrays, Python 2 would perform an
    # integer division below and truncate every relative error.
    truth = truth['gap'].values.astype(float)
    pred = np.asarray(pred)
    positive = truth > 0
    if not positive.any():
        # Nothing to score; the unguarded division produced NaN as well.
        return np.nan
    truth = truth[positive]
    pred = pred[positive]
    relative_error = (truth - pred) / truth
    return np.fabs(relative_error).sum() / truth.shape[0]

In [31]:
def score_on_test_data(clf, columns):
    """Score a fitted estimator on the test days.

    clf     -- fitted estimator exposing predict()
    columns -- feature columns passed to clf; an empty sequence means all
               columns returned by select_last_points

    For every test day the features of the slot before each S_range slot are
    fed to clf and scored with score_GBD. The per-day scores are averaged,
    weighted by the number of feature rows of the day.
    """
    SLOT1 = range(44,152,12)
    SLOT2 = range(56,152,12)
    RANGE = {'2016-01-23': SLOT1, '2016-01-25': SLOT2, '2016-01-27': SLOT1, '2016-01-29': SLOT2, '2016-01-31': SLOT1}
    scores = []
    total_rows = 0
    for day in RANGE.keys():
        x = select_last_points(day, RANGE[day])
        features = x if len(columns) == 0 else x[columns]
        score = score_GBD(day, clf.predict(features))
        scores.append(score * x.shape[0])
        total_rows += x.shape[0]
    # Divide by the real number of rows instead of the hard-coded 2838
    # (66 districts x 43 slots), so the result stays right if the ranges change.
    return np.array(scores).sum() / total_rows

In [32]:
def select_last_points(day, slot, mask=None):
    """Build the test feature rows of `day` for every district at each slot.

    day  -- test date key ('YYYY-MM-DD')
    slot -- iterable of time slots to build rows for
    mask -- district whose traffic replaces that of district 54; defaults to
            the module-level Mask for backward compatibility

    Returns a DataFrame indexed by (district, slot) holding the gap of the
    slot, the gap of the previous slot ('last'), the traffic columns and the
    joined weather and POI2 columns.
    """
    if mask is None:
        mask = Mask
    slot = np.array(slot)
    orders = test_order[day]
    x = orders.reindex(index(D_range, slot)).fillna(0)
    # Assign positionally: the previous-slot frame is indexed by
    # (district, slot - 1), so an index-aligned assignment would match no row.
    # Both frames are built in the same district-major order.
    x['last'] = orders.reindex(index(D_range, slot - 1)).fillna(0)['gap'].values
    x = pd.concat([x, test_traffic[day].reindex(index(D_range, slot))], axis=1)
    # For missing traffic data on district 54, replaced by mask.
    # A single .loc call is used because chained indexing
    # (x.loc[row][cols] = ...) may write to a temporary copy.
    for t in slot:
        x.loc[(54, t), 'LV1':'weekday'] = x.loc[(mask, t), 'LV1':'weekday'].values
    x = x.join(test_weather[day], on='time')
    x = x.join(POI2, on='district')
    return x

In [34]:
# District whose traffic data stands in for district 54; also read as a
# global by select_last_points. 9 scored best in the district sweeps recorded
# in this notebook.
Mask = 9
# Training features and targets built with that replacement.
X, Y = Masking(Mask)

Training


In [409]:
# Hold out 30% of the training rows.
# NOTE(review): newX / newY are only defined in a later cell, so this cell
# fails on a fresh top-to-bottom run; newX is also assigned there with
# 'last2' already dropped -- confirm the intended cell order.
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(newX.drop('last2',1), newY['GAP'], test_size=0.3)

In [599]:
from sklearn.ensemble import ExtraTreesClassifier

# Linearly decreasing class weights: gap 1 -> 44, gap 2 -> 43, ..., gap 43 -> 2.
class_weight = {gap: 45 - gap for gap in range(1, 44)}
params = dict(criterion='entropy', n_estimators=30, min_samples_leaf=1, min_samples_split=2,
              max_depth=11, max_features=0.9, random_state=1,
              verbose=0, class_weight=class_weight)
etc = ExtraTreesClassifier(**params)
# NOTE(review): newX, newY and columns are only defined in later cells.
etc.fit(newX[columns], newY['GAP'])


Out[599]:
ExtraTreesClassifier(bootstrap=False,
           class_weight={1: 44, 2: 43, 3: 42, 4: 41, 5: 40, 6: 39, 7: 38, 8: 37, 9: 36, 10: 35, 11: 34, 12: 33, 13: 32, 14: 31, 15: 30, 16: 29, 17: 28, 18: 27, 19: 26, 20: 25, 21: 24, 22: 23, 23: 22, 24: 21, 25: 20, 26: 19, 27: 18, 28: 17, 29: 16, 30: 15, 31: 14, 32: 13, 33: 12, 34: 11, 35: 10, 36: 9, 37: 8, 38: 7, 39: 6, 40: 5, 41: 4, 42: 3, 43: 2},
           criterion='entropy', max_depth=11, max_features=0.9,
           max_leaf_nodes=None, min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=30, n_jobs=1,
           oob_score=False, random_state=1, verbose=0, warm_start=False)

In [600]:
# Weighted test score of the ExtraTrees model on the selected feature columns.
score_on_test_data(etc, columns)


Select data from 2016-01-31 on [ 44  56  68  80  92 104 116 128 140]
	 shape: (594, 38)
	 score: 0.47556958099
Select data from 2016-01-23 on [ 44  56  68  80  92 104 116 128 140]
	 shape: (594, 38)
	 score: 0.458208490768
Select data from 2016-01-29 on [ 56  68  80  92 104 116 128 140]
	 shape: (528, 38)
	 score: 0.490209217019
Select data from 2016-01-27 on [ 44  56  68  80  92 104 116 128 140]
	 shape: (594, 38)
	 score: 0.423029987208
Select data from 2016-01-25 on [ 56  68  80  92 104 116 128 140]
	 shape: (528, 38)
	 score: 0.464640080984
0.461629463133

In [816]:
# Keep only the rows with a positive target gap, drop the 'last2' feature
# and cap the target: every gap above 39 becomes 40.
newX = X[Y['GAP'] > 0].drop('last2', axis=1)
newY = Y[Y['GAP'] > 0].copy()
newY[newY['GAP'] > 39] = 40

In [772]:
from sklearn.ensemble import RandomForestClassifier

# Linearly decreasing class weights: gap 1 -> 38, gap 2 -> 37, ..., gap 37 -> 2.
class_weight = {gap: 39 - gap for gap in range(1, 38)}
# columns = ['gap', 'last', 'pm2.5', '16#4', '16#10', '24#2',
#            'LV1', 'LV2', 'LV3', 'LV4', 'day', 'time']
params = dict(criterion='entropy', n_estimators=40, min_samples_leaf=10, min_samples_split=20,
              max_depth=9, max_features=0.85, random_state=1,
              verbose=0, class_weight=class_weight)
rfc = RandomForestClassifier(**params)
# Fitted on every feature column; its importances drive the column selection.
rfc.fit(newX, newY['GAP'])
# rfc.fit(newX[columns], etc.predict(newX[columns]))


Out[772]:
RandomForestClassifier(bootstrap=True,
            class_weight={1: 38, 2: 37, 3: 36, 4: 35, 5: 34, 6: 33, 7: 32, 8: 31, 9: 30, 10: 29, 11: 28, 12: 27, 13: 26, 14: 25, 15: 24, 16: 23, 17: 22, 18: 21, 19: 20, 20: 19, 21: 18, 22: 17, 23: 16, 24: 15, 25: 14, 26: 13, 27: 12, 28: 11, 29: 10, 30: 9, 31: 8, 32: 7, 33: 6, 34: 5, 35: 4, 36: 3, 37: 2},
            criterion='entropy', max_depth=9, max_features=0.85,
            max_leaf_nodes=None, min_samples_leaf=10, min_samples_split=20,
            min_weight_fraction_leaf=0.0, n_estimators=40, n_jobs=1,
            oob_score=False, random_state=1, verbose=0, warm_start=False)

In [828]:
# print rfc.feature_importances_
# Keep the features whose importance in the fitted forest exceeds 0.01.
columns = newX.columns[rfc.feature_importances_ > 0.01].values
print(columns)


['gap' 'last' 'pm2.5' '16#10' '16#4' '24#2' 'LV1' 'LV2' 'LV3' 'LV4' 'day'
 'time']

In [26]:
# Rebuild the training set: rows with a positive target gap only, without the
# 'last2' feature, and with every gap above 39 capped to 40.
positive_rows = Y['GAP'] > 0
newX = X[positive_rows].drop('last2', axis=1)
newY = Y[positive_rows].copy()
newY[newY['GAP'] > 39] = 40
del positive_rows
# from sklearn.cross_validation import train_test_split
# X_train, X_test, y_train, y_test = train_test_split(newX[columns], newY['GAP'], test_size=0.7)

In [50]:
from sklearn.ensemble import RandomForestClassifier

# Feature columns used by the final models.
columns = ['gap', 'last', 'pm2.5', '16#4', '16#10', '24#2',
           'LV1', 'LV2', 'LV3', 'LV4', 'day', 'time']

# Linearly decreasing class weights: gap 1 -> 38, gap 2 -> 37, ..., gap 37 -> 2.
class_weight = {gap: 39 - gap for gap in range(1, 38)}
params = dict(criterion='entropy', n_estimators=40, min_samples_leaf=10, min_samples_split=20,
              max_depth=9, max_features=0.85, random_state=1,
              verbose=0, class_weight=class_weight)
rfc1 = RandomForestClassifier(**params)
rfc1.fit(newX[columns], newY['GAP'])
print(score_on_test_data(rfc1, columns))


0.423280995219

In [832]:
# Best District : 9
# without54 = range(1,67)
# without54.remove(54)
# for D in [9,11,19,20,24,28,29,36,37,59]:
#     X, Y = Masking(D)
#     newY = Y.copy()
#     newX = X.copy()
#     newX = newX[newY['GAP']>0]
#     newX = newX.drop('last2',axis=1)
#     newY = newY[newY['GAP']>0]
#     newY[newY['GAP']>39]=40
#     class_weight = dict(zip(range(1,38), range(38,1,-1)))
#     columns = ['gap', 'last', 'pm2.5', '16#4', '16#10', '24#2',
#                'LV1', 'LV2', 'LV3', 'LV4', 'day', 'time']
#     from sklearn.ensemble import RandomForestClassifier
#     params = {'criterion': 'entropy', 'n_estimators': 40, 'min_samples_leaf':10, 'min_samples_split':20,
#               'max_depth': 9, 'max_features': 0.85, 'random_state':1,
#               'verbose':0, 'class_weight': class_weight}
#     rfc = RandomForestClassifier(**params)
#     rfc.fit(newX[columns], newY['GAP'])
#     print "Replaced by disctric {} : {:.8f}".format(D, score_on_test_data(rfc, columns))


Replaced by disctric 9 : 0.42519920
Replaced by disctric 11 : 0.42780553
Replaced by disctric 19 : 0.43265371
Replaced by disctric 20 : 0.43311840
Replaced by disctric 24 : 0.43149041
Replaced by disctric 28 : 0.42867821
Replaced by disctric 29 : 0.42982001
Replaced by disctric 36 : 0.42799378
Replaced by disctric 37 : 0.42624577
Replaced by disctric 59 : 0.42745728

In [116]:
from sklearn.ensemble import GradientBoostingRegressor

# Quantile (alpha = 0.85) gradient boosting regressor.
params = dict(loss='quantile', alpha=0.85, n_estimators=100, max_features=0.85, random_state=1,
              min_samples_split=10, max_depth=10, learning_rate=0.08, subsample=0.9, verbose=0)
Regr = GradientBoostingRegressor(**params)
# The regressor is fitted on the predictions of rfc1, not on the true gaps.
Regr.fit(newX[columns], rfc1.predict(newX[columns]))
print(score_on_test_data(Regr, columns))


0.420953714675

In [109]:
def write5(x, day, slot, rfc, mode):
    """Write the predictions of one test day to 'ans5_v3.csv'.

    x    -- feature frame indexed by (district, slot)
    day  -- date string used in the output rows
    slot -- slots of x to predict from; each row is labelled with slot + 1
    rfc  -- fitted estimator exposing predict()
    mode -- file mode passed to open(), e.g. 'a' to append

    One CSV row is written per (district, slot):
    district, '<day>-<slot + 1>', prediction with 15 decimals.
    """
    with open('ans5_v3.csv', mode) as f:
        writer = csv.writer(f, delimiter=',')
        for D in D_range:
            for S in slot:
                # Reshape the underlying ndarray into a single-sample 2-D
                # input; Series.reshape is not available in newer pandas.
                features = x.loc[(D, S)].values.reshape(1, -1)
                gap = rfc.predict(features)[0]
                writer.writerow([str(D), '{}-{}'.format(day, S + 1), '{:.15f}'.format(gap)])
# Predict every test day and append the rows to the submission file.
# The file is opened in append mode, so re-running this cell adds the rows again.
for day in S_range:
    x = select_last_points(day, S_range[day])
    write5(x[columns], day, S_range[day], Regr, 'a')

In [117]:
from sklearn.ensemble import AdaBoostRegressor

# AdaBoost over the quantile regressor, trained on the true gaps.
params = dict(base_estimator=Regr, n_estimators=5, learning_rate=1.0, random_state=1, loss='square')
adbr = AdaBoostRegressor(**params)
adbr.fit(newX[columns], newY['GAP'])
print(score_on_test_data(adbr, columns))


1.36590289181

In [91]:
from sklearn.ensemble import BaggingRegressor

# Bag five copies of the quantile regressor.
params = dict(base_estimator=Regr, verbose=1, n_estimators=5)

bgr = BaggingRegressor(**params)
# The bagged model is fitted on Regr's own predictions, not on the true gaps.
bgr.fit(newX[columns], Regr.predict(newX[columns]))
print(score_on_test_data(bgr, columns))


      Iter       Train Loss      OOB Improve   Remaining Time 
         1           1.0196           0.2006            1.73m
         2           0.8967           0.1202            1.83m
         3           0.8181           0.0925            1.79m
         4           0.7182           0.1001            1.80m
         5           0.6360           0.0865            1.82m
         6           0.5861           0.0613            1.81m
         7           0.5151           0.0594            1.97m
         8           0.4645           0.0509            2.22m
         9           0.4053           0.0511            2.35m
        10           0.3739           0.0378            2.48m
        20           0.1314           0.0138            2.50m
        30           0.0353           0.0054            2.53m
        40          -0.0031           0.0015            2.03m
        50          -0.0168           0.0006            1.60m
        60          -0.0206           0.0001            1.24m
        70          -0.0213          -0.0002           54.13s
        80          -0.0207          -0.0000           35.91s
        90          -0.0200          -0.0001           17.64s
       100          -0.0194          -0.0001            0.00s
      Iter       Train Loss      OOB Improve   Remaining Time 
         1           1.0049           0.1696            1.94m
         2           0.8963           0.1069            1.96m
         3           0.7880           0.1020            2.05m
         4           0.7181           0.0739            2.07m
         5           0.6300           0.0784            2.33m
         6           0.5651           0.0639            2.35m
         7           0.4991           0.0586            2.35m
         8           0.4470           0.0526            2.33m
         9           0.3971           0.0467            2.37m
        10           0.3631           0.0368            2.42m
        20           0.1168           0.0132            2.15m
        30           0.0259           0.0039            1.81m
        40          -0.0105           0.0016            1.52m
        50          -0.0216           0.0004            1.25m
        60          -0.0239          -0.0002            1.00m
        70          -0.0233          -0.0001           44.91s
        80          -0.0223          -0.0002           30.39s
        90          -0.0211          -0.0002           15.23s
       100          -0.0201          -0.0001            0.00s
      Iter       Train Loss      OOB Improve   Remaining Time 
         1           1.0197           0.1885            1.84m
         2           0.9192           0.1036            1.99m
         3           0.8121           0.0926            2.16m
         4           0.7271           0.0904            2.28m
         5           0.6532           0.0689            2.29m
         6           0.5854           0.0598            2.25m
         7           0.5132           0.0674            2.26m
         8           0.4680           0.0486            2.28m
         9           0.4203           0.0447            2.25m
        10           0.3683           0.0486            2.24m
        20           0.1333           0.0140            2.13m
        30           0.0333           0.0073            1.80m
        40          -0.0044           0.0026            1.56m
        50          -0.0173           0.0007            1.33m
        60          -0.0208           0.0002            1.04m
        70          -0.0213          -0.0003           46.08s
        80          -0.0210          -0.0001           31.72s
        90          -0.0202          -0.0001           15.59s
       100          -0.0196          -0.0001            0.00s
      Iter       Train Loss      OOB Improve   Remaining Time 
         1           1.0078           0.2214            2.32m
         2           0.9268           0.0908            2.41m
         3           0.8097           0.1015            2.46m
         4           0.7120           0.0965            2.45m
         5           0.6414           0.0830            2.56m
         6           0.5769           0.0731            2.65m
         7           0.5161           0.0587            2.69m
         8           0.4657           0.0488            2.68m
         9           0.4097           0.0490            2.65m
        10           0.3659           0.0432            2.67m
        20           0.1280           0.0151            2.30m
        30           0.0317           0.0067            2.13m
        40          -0.0054           0.0020            1.75m
        50          -0.0182           0.0005            1.42m
        60          -0.0218           0.0001            1.13m
        70          -0.0221          -0.0001           50.97s
        80          -0.0219          -0.0002           33.90s
        90          -0.0203          -0.0002           17.13s
       100          -0.0196          -0.0002            0.00s
      Iter       Train Loss      OOB Improve   Remaining Time 
         1           0.9936           0.1952            5.71m
         2           0.9137           0.0911            5.58m
         3           0.8056           0.0937            4.69m
         4           0.7150           0.0840            4.24m
         5           0.6263           0.0857            4.02m
         6           0.5721           0.0653            3.82m
         7           0.5061           0.0594            3.75m
         8           0.4534           0.0506            3.79m
         9           0.3973           0.0516            3.68m
        10           0.3701           0.0356            3.58m
        20           0.1213           0.0149            3.33m
        30           0.0300           0.0065            2.59m
        40          -0.0055           0.0023            2.02m
        50          -0.0173           0.0006            1.57m
        60          -0.0210           0.0000            1.21m
        70          -0.0217          -0.0001           52.62s
        80          -0.0213          -0.0001           35.00s
        90          -0.0206          -0.0001           17.65s
       100          -0.0203          -0.0001            0.00s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed: 13.8min finished
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
0.42649308167

In [737]:
from sklearn.ensemble import GradientBoostingClassifier

# Hyper-parameters of the gradient-boosted gap classifier.
# The original literal listed 'min_samples_split' twice (100, then 20). Python keeps
# the last value of a repeated key, so only the effective setting (20) is spelled out.
params = {'loss': 'deviance', 'learning_rate': 0.1, 'n_estimators': 50, 'min_samples_leaf': 10,
          'max_depth': 3, 'max_features': 0.8, 'subsample': 1.0, 'min_samples_split': 20,
          'random_state': 1, 'verbose': 0}
grd = GradientBoostingClassifier(**params)
grd.fit(newX[columns], newY['GAP'])
print(score_on_test_data(grd, columns))


Out[737]:
GradientBoostingClassifier(init=None, learning_rate=0.1, loss='deviance',
              max_depth=3, max_features=0.8, max_leaf_nodes=None,
              min_samples_leaf=10, min_samples_split=20,
              min_weight_fraction_leaf=0.0, n_estimators=50,
              presort='auto', random_state=1, subsample=1.0, verbose=0,
              warm_start=False)

In [628]:
from sklearn.ensemble import BaggingClassifier

# Bagging ensemble of 5 estimators that uses `grd` as the base estimator.
params = dict(base_estimator=grd, verbose=1, n_estimators=5)
bgc = BaggingClassifier(**params)

# NOTE(review): the target passed here is grd's own prediction on the training
# features, not newY['GAP'] -- confirm this is intended.
bagging_features = newX[columns]
bgc.fit(bagging_features, grd.predict(bagging_features))


      Iter       Train Loss   Remaining Time 
         1      172624.1761           16.64m
         2      143984.9064           15.83m
         3      124735.3358           14.93m
         4      109674.3424           14.32m
         5       97780.2853           13.76m
         6       88028.6071           13.32m
         7       79930.7874           12.98m
         8       73394.8895           12.72m
         9       67911.2652           12.42m
        10       63166.6599           12.20m
        20       38950.5329            9.00m
        30       30167.2405            5.95m
        40       25634.6061            2.94m
        50       22713.8607            0.00s
      Iter       Train Loss   Remaining Time 
         1      173763.3828           14.78m
         2      145428.7745           14.36m
         3      125591.3903           13.86m
         4      110824.4021           13.60m
         5       98875.8994           13.29m
         6       89403.9278           12.99m
         7       81524.0367           12.74m
         8       75007.7739           12.64m
         9       69286.0766           12.35m
        10       64612.5094           12.07m
        20       40385.1230            9.02m
        30       31553.2203            5.99m
        40       27216.2519            2.96m
        50       27129.4854            0.00s
      Iter       Train Loss   Remaining Time 
         1      170729.8553           14.00m
         2      143300.6151           13.77m
         3      124078.0875           13.43m
         4      108904.5696           13.25m
         5       96948.8987           13.02m
         6       87518.0566           12.70m
         7       79625.4242           12.42m
         8       73063.4792           12.16m
         9       67372.3198           11.87m
        10       62567.2018           11.58m
        20       38225.0918            8.77m
        30       29558.6664            5.81m
        40       25716.8432            2.91m
        50       23955.4922            0.00s
      Iter       Train Loss   Remaining Time 
         1      173724.0734           14.03m
         2      145361.7250           13.80m
         3      125373.1050           13.52m
         4      110316.2540           13.26m
         5       97975.2287           12.98m
         6       88169.7210           12.65m
         7       80147.0268           12.63m
         8       73652.3165           12.38m
         9       68063.5403           12.11m
        10       63213.4783           11.82m
        20       38648.7288            8.86m
        30       29793.9327            5.92m
        40       25188.7718            3.02m
        50       22686.0381            0.00s
      Iter       Train Loss   Remaining Time 
         1      174809.2763           16.58m
         2      146621.2834           15.48m
         3      126695.3826           14.83m
         4      111324.5229           14.33m
         5       98989.6776           13.93m
         6       89088.5690           13.60m
         7       81040.5279           13.22m
         8       74343.1164           12.84m
         9       68713.4133           12.51m
        10       63924.5349           12.14m
        20       39382.4675            9.01m
        30       30483.4171            5.89m
        40       19988.1372            2.95m
        50       13755.1015            0.00s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed: 73.3min finished
Out[628]:
BaggingClassifier(base_estimator=GradientBoostingClassifier(init=None, learning_rate=0.1, loss='deviance',
              max_depth=3, max_features=0.8, max_leaf_nodes=None,
              min_samples_leaf=10, min_samples_split=20,
              min_weight_fraction_leaf=0.0, n_estimators=50,
              presort='auto', random_state=1, subsample=1.0, verbose=1,
              warm_start=False),
         bootstrap=True, bootstrap_features=False, max_features=1.0,
         max_samples=1.0, n_estimators=5, n_jobs=1, oob_score=False,
         random_state=None, verbose=1, warm_start=False)

In [629]:
# Evaluate the bagging ensemble `bgc` on the selected feature columns.
score_on_test_data(bgc, columns)


Select data from 2016-01-31 on [ 44  56  68  80  92 104 116 128 140]
	 shape: (594, 38)
	 score: 0.439117662744
Select data from 2016-01-23 on [ 44  56  68  80  92 104 116 128 140]
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.2s finished
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.3s finished
	 shape: (594, 38)
	 score: 0.402798003284
Select data from 2016-01-29 on [ 56  68  80  92 104 116 128 140]
	 shape: (528, 38)
	 score: 0.462474864258
Select data from 2016-01-27 on [ 44  56  68  80  92 104 116 128 140]
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.2s finished
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.2s finished
	 shape: (594, 38)
	 score: 0.401695452452
Select data from 2016-01-25 on [ 56  68  80  92 104 116 128 140]
	 shape: (528, 38)
	 score: 0.426770371535
0.425731673318
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s finished

In [624]:
# Same evaluation for the regressor `Regr`, using the same feature columns.
score_on_test_data(Regr, columns)


Select data from 2016-01-31 on [ 44  56  68  80  92 104 116 128 140]
	 shape: (594, 38)
	 score: 0.437484085351
Select data from 2016-01-23 on [ 44  56  68  80  92 104 116 128 140]
	 shape: (594, 38)
	 score: 0.404582771329
Select data from 2016-01-29 on [ 56  68  80  92 104 116 128 140]
	 shape: (528, 38)
	 score: 0.452958169253
Select data from 2016-01-27 on [ 44  56  68  80  92 104 116 128 140]
	 shape: (594, 38)
	 score: 0.415666324248
Select data from 2016-01-25 on [ 56  68  80  92 104 116 128 140]
	 shape: (528, 38)
	 score: 0.448645492101
0.430986695795

In [ ]:


In [525]:
# Select important columns
# columns = X_train.columns[[i for i, important in enumerate(rfc.feature_importances_ > 0.001) if important]].values
columns = ['gap', 'last', '4', '13', '16', '23', '24', 'LV1', 'LV2', 'LV3', 'time']
from sklearn.cross_validation import train_test_split
# Hold out 30% of the samples. The split is seeded (same seed as the estimators in this
# notebook) so that re-running the cell yields the same train/test partition.
X_train, X_test, y_train, y_test = train_test_split(newX[columns], newY['GAP'], test_size=0.3,
                                                    random_state=1)

In [526]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.grid_search import GridSearchCV
from sklearn import grid_search
from sklearn.metrics import make_scorer
from sklearn.metrics import mean_squared_error

# Scorer and search grid used by the (currently disabled) grid search below.
scoring_function = make_scorer(mean_squared_error, greater_is_better=False)
parameters = {'min_samples_split': np.arange(60, 140, 20)}

# Define the forest's hyper-parameters in this cell instead of inheriting whatever
# `params` another cell last left in the kernel. The values reproduce the estimator
# shown in this cell's recorded output (class weights 38 down to 2 for classes 1..37).
params = {'criterion': 'entropy', 'n_estimators': 40, 'min_samples_leaf': 10, 'min_samples_split': 10,
          'max_depth': 9, 'max_features': 0.85, 'random_state': 1, 'verbose': 0,
          'class_weight': dict(zip(range(1, 38), range(38, 1, -1)))}

# rfc = GridSearchCV(RandomForestClassifier(**params), parameters, scoring=scoring_function)
rfc = RandomForestClassifier(**params)
rfc.fit(X_train, y_train)


Out[526]:
RandomForestClassifier(bootstrap=True,
            class_weight={1: 38, 2: 37, 3: 36, 4: 35, 5: 34, 6: 33, 7: 32, 8: 31, 9: 30, 10: 29, 11: 28, 12: 27, 13: 26, 14: 25, 15: 24, 16: 23, 17: 22, 18: 21, 19: 20, 20: 19, 21: 18, 22: 17, 23: 16, 24: 15, 25: 14, 26: 13, 27: 12, 28: 11, 29: 10, 30: 9, 31: 8, 32: 7, 33: 6, 34: 5, 35: 4, 36: 3, 37: 2},
            criterion='entropy', max_depth=9, max_features=0.85,
            max_leaf_nodes=None, min_samples_leaf=10, min_samples_split=10,
            min_weight_fraction_leaf=0.0, n_estimators=40, n_jobs=1,
            oob_score=False, random_state=1, verbose=0, warm_start=False)

In [528]:
# Mean absolute percentage error of the forest on the held-out split.
abs_pct_error = (y_test - rfc.predict(X_test)).abs() / y_test
print(abs_pct_error.sum() / y_test.shape[0])


0.409512573031

In [351]:
# Cross-validation score for every grid point; requires `rfc` to be a fitted GridSearchCV.
# NOTE(review): the GridSearchCV construction is currently commented out in the cell that builds `rfc` -- confirm before re-running.
rfc.grid_scores_


Out[351]:
[mean: -21.25011, std: 0.05212, params: {'min_samples_split': 60},
 mean: -21.70977, std: 0.30694, params: {'min_samples_split': 80},
 mean: -21.44994, std: 0.08163, params: {'min_samples_split': 100},
 mean: -21.46662, std: 0.25857, params: {'min_samples_split': 120}]

In [352]:
# Best parameter combination found by the grid search (requires `rfc` to be a fitted GridSearchCV).
rfc.best_params_


Out[352]:
{'min_samples_split': 60}

In [353]:
# Replace the search object by its best estimator. Not idempotent: after this line
# `rfc` no longer has `best_estimator_`, so the cell cannot be run twice in a row.
rfc = rfc.best_estimator_

Writing Ans


In [390]:
final_columns = columns + ['last2']

# Keep only the samples whose gap is strictly positive. The mask is computed once from
# the untouched Y so that features and targets are filtered consistently.
has_gap = Y['GAP'] > 0
newX = X[has_gap].copy()
newY = Y[has_gap].copy()

# Cap the target: every gap above 39 becomes class 40. The assignment goes through
# .loc on the 'GAP' column only; the previous `newY[mask] = 40` overwrote every column
# of the matching rows.
newY.loc[newY['GAP'] > 39, 'GAP'] = 40

In [404]:
# Hyper-parameters are defined here so the cell does not depend on a `params` left
# behind by another cell. They match the estimator shown in this cell's recorded
# output; the duplicated 'min_samples_split' entry of the old commented-out literal
# is reduced to its effective value (20).
params = {'criterion': 'entropy', 'n_estimators': 40, 'min_samples_leaf': 10,
          'max_depth': 9, 'max_features': 0.85, 'min_samples_split': 20, 'random_state': 1,
          'verbose': 0, 'class_weight': dict(zip(range(1, 20), range(20, 1, -1)))}

rfc = RandomForestClassifier(**params)
rfc.fit(newX[columns], newY['GAP'])


Out[404]:
RandomForestClassifier(bootstrap=True,
            class_weight={1: 20, 2: 19, 3: 18, 4: 17, 5: 16, 6: 15, 7: 14, 8: 13, 9: 12, 10: 11, 11: 10, 12: 9, 13: 8, 14: 7, 15: 6, 16: 5, 17: 4, 18: 3, 19: 2},
            criterion='entropy', max_depth=9, max_features=0.85,
            max_leaf_nodes=None, min_samples_leaf=10, min_samples_split=20,
            min_weight_fraction_leaf=0.0, n_estimators=40, n_jobs=1,
            oob_score=False, random_state=1, verbose=0, warm_start=False)

In [531]:
def select_last_points_ans(day, slot):
    """Build the feature rows of every district for the given slots of a test day.

    Parameters
    ----------
    day : key of ``test_order`` / ``test_traffic`` / ``test_weather``
    slot : sequence of int
        Time slots to select.

    Returns
    -------
    DataFrame indexed by the (district, slot) pairs of ``index(D_range, slot)``,
    holding the order columns, 'last' / 'last2', and the joined traffic, weather
    and POI columns. The selection and resulting shape are printed as a side effect.
    """
    slot = np.array(slot)
    x = test_order[day].reindex(index(D_range, slot)).fillna(0)
    # NOTE(review): the right-hand sides below are indexed by the shifted slots
    # (slot-1, slot-2); confirm the values land on the intended rows of `x`.
    x['last'] = test_order[day].reindex(index(D_range, slot-1)).fillna(0)
    x['last2'] = test_order[day].reindex(index(D_range, slot-2)).fillna(0)
    x = pd.concat([x, test_traffic[day].reindex(index(D_range, slot))], axis=1)
    # For missing traffic data on district 54, replaced by Mask.
    # A single .loc call writes into `x` itself; the former chained form
    # `x.loc[row][cols] = ...` could assign to a temporary copy and be lost.
    for t in slot:
        x.loc[(54, t), 'LV1':'weekday'] = x.loc[(Mask, t), 'LV1':'weekday'].values
    x = x.join(test_weather[day], on='time')
    x = x.join(POI, on='district')
    print("Select data from {} on {}".format(day, slot))
    print("\t shape: {}".format(x.shape))
    return x

In [532]:
def write5(x, day, slot, rfc, mode, filename='ans5_v2.csv'):
    """Write one predicted gap per (district, slot) to a CSV file.

    Parameters
    ----------
    x : DataFrame of features indexed by (district, slot)
    day : label used as prefix of the slot identifier
    slot : iterable of int slots present in ``x``; the row is labelled with ``S+1``
    rfc : fitted estimator exposing ``predict``
    mode : file mode passed to ``open`` ('w' to start a file, 'a' to append)
    filename : output path (defaults to the original hard-coded 'ans5_v2.csv')
    """
    with open(filename, mode) as f:
        writer = csv.writer(f, delimiter=',')
        for D in D_range:
            for S in slot:
                # Series.reshape was deprecated and later removed from pandas;
                # reshape the underlying array instead.
                features = x.loc[(D, S)].values.reshape(1, -1)
                gap = rfc.predict(features)[0]
                writer.writerow([str(D), '{}-{}'.format(day, S+1), '{:.15f}'.format(gap)])

In [534]:
# Write the predictions of every test day. The first day truncates the answer file
# ('w') and the following days append ('a'), so re-running the cell no longer
# duplicates rows in the file.
for i, day in enumerate(S_range.keys()):
    x = select_last_points_ans(day, S_range[day])
    write5(x[columns], day, S_range[day], rfc, 'w' if i == 0 else 'a')


Select data from 2016-01-31 on [ 45  57  69  81  93 105 117 129 141]
	 shape: (594, 39)
Select data from 2016-01-23 on [ 45  57  69  81  93 105 117 129 141]
	 shape: (594, 39)
Select data from 2016-01-29 on [ 57  69  81  93 105 117 129 141]
	 shape: (528, 39)
Select data from 2016-01-27 on [ 45  57  69  81  93 105 117 129 141]
	 shape: (594, 39)
Select data from 2016-01-25 on [ 57  69  81  93 105 117 129 141]
	 shape: (528, 39)

Method4 by GradientBoosting on classification


In [ ]:
from sklearn.ensemble import GradientBoostingClassifier

# 'min_samples_split' used to appear twice in this literal (100, then 20); only the
# last value ever took effect, so the dead entry is dropped.
params = {'loss': 'deviance', 'learning_rate': 0.1, 'n_estimators': 50, 'min_samples_leaf': 10,
          'max_depth': 3, 'max_features': 0.8, 'subsample': 1.0, 'min_samples_split': 20,
          'random_state': 1, 'verbose': 1}
grd = GradientBoostingClassifier(**params)
grd.fit(X_train,y_train)

In [200]:
# Slots used as input for each test day.
SLOT1 = range(44,152,12)
SLOT2 = range(56,152,12)
RANGE = {'2016-01-23': SLOT1, '2016-01-25': SLOT2, '2016-01-27': SLOT1, '2016-01-29': SLOT2, '2016-01-31': SLOT1}
scores = []
n_rows = 0
for day in RANGE.keys():
    x = select_last_points(day, RANGE[day])
    y_pred = grd.predict(x[columns])
    score = score_GBD(day, y_pred)
    # Weight each day's score by its number of rows.
    scores.append(score * x.shape[0])
    n_rows += x.shape[0]
    print('\t score: {}'.format(score))
# Row-weighted mean over all days; the divisor used to be the hard-coded total 2838.
print(np.array(scores).sum() / n_rows)


Select data from 2016-01-31 on [ 44  56  68  80  92 104 116 128 140]
	 shape: (594, 38)
	 score: 0.468503175914
Select data from 2016-01-23 on [ 44  56  68  80  92 104 116 128 140]
	 shape: (594, 38)
	 score: 0.476817064303
Select data from 2016-01-29 on [ 56  68  80  92 104 116 128 140]
	 shape: (528, 38)
	 score: 0.514905471381
Select data from 2016-01-27 on [ 44  56  68  80  92 104 116 128 140]
	 shape: (594, 38)
	 score: 0.471597751025
Select data from 2016-01-25 on [ 56  68  80  92 104 116 128 140]
	 shape: (528, 38)
	 score: 0.488898065986
0.483318377212

In [ ]:
# Fit on all features first; the importances of this model drive the column selection.
# The repeated 'min_samples_split' key (100, then 20) is reduced to its effective value.
params = {'loss': 'deviance', 'learning_rate': 0.1, 'n_estimators': 50, 'min_samples_leaf': 10,
          'max_depth': 3, 'max_features': 0.8, 'subsample': 1.0, 'min_samples_split': 20,
          'random_state': 1}
clf = GradientBoostingClassifier(**params)
clf.fit(X,Y)

Select important columns


In [ ]:
# Keep the features whose importance in `clf` exceeds 0.02.
columns = [name
           for name, importance in zip(X.columns, clf.feature_importances_)
           if importance > 0.02]

In [ ]:
# Refit on the selected columns with a higher learning rate (0.15).
# The repeated 'min_samples_split' key (100, then 20) is reduced to its effective value.
params = {'loss': 'deviance', 'learning_rate': 0.15, 'n_estimators': 50, 'min_samples_leaf': 10,
          'max_depth': 3, 'max_features': 0.8, 'subsample': 1.0, 'min_samples_split': 20,
          'random_state': 1, 'verbose': 1}
clf = GradientBoostingClassifier(**params)
clf.fit(X[columns],Y)

In [ ]:
# Tuning log; each entry appears to read "setting change : score before -> score after".
# NOTE(review): both entries record the same resulting score -- confirm the second one was not copy-pasted.
# learning rate: 0.1 -> 0.05 : 0.498091376721 -> 0.50989613439
# learning rate: 0.1 -> 0.15 : 0.498091376721 -> 0.50989613439
get_score(clf, columns)

Writing ANS


In [ ]:
def write4(x, day, slot, clf, mode, filename='ans4_v1.csv'):
    """Write one predicted gap per (district, slot) to a CSV file.

    District 54 is not predicted by the model: its gap is a weighted average
    (0.5 / 0.3 / 0.2) of the observed gaps at slots S, S-1 and S-2.
    Negative predictions are clipped to 0.

    Parameters
    ----------
    x : DataFrame of features indexed by (district, slot)
    day : key of ``test_order``; also the prefix of the slot identifier
    slot : iterable of int slots; the row is labelled with ``S+1``
    clf : fitted estimator exposing ``predict``
    mode : file mode passed to ``open``
    filename : output path (defaults to the original hard-coded 'ans4_v1.csv')
    """
    # Observed gap of the whole day with missing (district, slot) pairs as 0.
    # Built once per call; it used to be rebuilt for every slot of district 54.
    history = test_order[day].reindex(index(D_range, T_range)).fillna(0)['gap']
    with open(filename, mode) as f:
        writer = csv.writer(f, delimiter=',')
        for D in D_range:
            for S in slot:
                # Compare by value: `D is 54` tested object identity and only
                # worked through CPython's small-integer cache.
                if D == 54:
                    gap = (0.5 * history.loc[(D, S)]
                           + 0.3 * history.loc[(D, S-1)]
                           + 0.2 * history.loc[(D, S-2)])
                else:
                    # Series.reshape was deprecated and later removed from pandas.
                    gap = clf.predict(x.loc[(D, S)].values.reshape(1, -1))[0]
                gap = 0 if gap < 0 else gap
                writer.writerow([str(D), '{}-{}'.format(day, S+1), '{:.15f}'.format(gap)])

In [ ]:
SLOT1 = range(45,153,12)
SLOT2 = range(57,153,12)
RANGE = {'2016-01-23': SLOT1, '2016-01-25': SLOT2, '2016-01-27': SLOT1, '2016-01-29': SLOT2, '2016-01-31': SLOT1}
scores = []
for i, day in enumerate(RANGE.keys()):
    x = select_last_points(day, RANGE[day])
    # Use the slots declared in this cell for both selection and writing (the global
    # S_range is re-assigned by other cells). The first day truncates the answer
    # file and the others append, so a re-run does not duplicate rows.
    write4(x[columns], day, RANGE[day], clf, 'w' if i == 0 else 'a')

Feature transformations with ensembles of trees (cont.)


In [ ]:
def score5(day, pred):
    """Mean absolute percentage error of ``pred`` against the observed gaps of ``day``.

    The observed gaps are read from ``test_order[day]`` for every pair of
    ``index(D_range, S_range[day])`` (missing pairs count as 0). Only pairs whose
    observed gap is positive are scored.

    Parameters
    ----------
    day : key of ``test_order`` and ``S_range``
    pred : array-like with one prediction per (district, slot) pair, in index order

    Returns
    -------
    float : sum of |observed - predicted| / observed, divided by the number of scored pairs.
    """
    # The body used to start with `Ds = range(1,67); Ds.reomve(54)`, a typo that raised
    # AttributeError on every call. Neither `Ds` nor the district-54 frame built further
    # down was ever used, so both are removed.
    # TODO: special handling of district 54 was started here but never implemented.
    ans = test_order[day].reindex(index(D_range, S_range[day])).fillna(0)
    ans = ans['gap'].values
    scored = ans > 0
    pred = pred[scored]
    ans = ans[scored]
    gap = (ans - pred) / ans

    return np.fabs(gap).sum()/ans.shape[0]

def get_score(clf, columns):
    """Print the score of ``clf`` for each test day and the row-weighted mean.

    Parameters
    ----------
    clf : fitted estimator exposing ``predict``
    columns : feature columns passed to the estimator

    For every day the features are selected with ``select_last_points``, rows with
    missing values are dropped before prediction, and the prediction is scored with
    ``score5``. Nothing is returned; the scores are printed.
    """
    SLOT1 = range(44,152,12)
    SLOT2 = range(56,152,12)
    RANGE = {'2016-01-23': SLOT1, '2016-01-25': SLOT2, '2016-01-27': SLOT1, '2016-01-29': SLOT2, '2016-01-31': SLOT1}
    scores = []
    n_rows = 0
    for day in RANGE.keys():
        x = select_last_points(day, RANGE[day])
        score = score5(day, clf.predict(x[columns].dropna()))
        # Weight each day's score by its number of selected rows.
        scores.append(score * x.shape[0])
        n_rows += x.shape[0]
        print('\t score: {}'.format(score))
    # The divisor used to be the hard-coded total 2838.
    print(np.array(scores).sum() / n_rows)

In [ ]:
# Keep strictly positive gaps only, then merge every gap above 12 into class 13.
positive_gap = Y_gap > 0
Y = Y_gap[positive_gap]
X = X[positive_gap]
Y[Y > 12] = 13

In [ ]:
# Feature list extended with 'last2'; the commented line is an earlier manual column list.
# columns = ['gap', 'last', 'LV1', 'LV2', 'LV3', 'LV4', 'district', 'time', 'pm2.5']
new_columns = columns + ['last2']

In [ ]:
from sklearn.cross_validation import train_test_split
# 30% of the samples train the boosted trees, the remaining 70% (the "*_lr" part) are
# kept for the logistic regression fitted on the tree leaves. The split is seeded so
# that a re-run produces the same partition.
X_train, X_train_lr, y_train, y_train_lr = train_test_split(X[columns], Y, test_size=0.7,
                                                            random_state=1)

# The repeated 'min_samples_split' key (100, then 20) is reduced to its effective value.
params = {'loss': 'deviance', 'learning_rate': 0.15, 'n_estimators': 50, 'min_samples_leaf': 10,
          'max_depth': 3, 'max_features': 0.8, 'subsample': 1.0, 'min_samples_split': 20,
          'random_state': 1, 'verbose': 1}
grd = GradientBoostingClassifier(**params)
grd.fit(X_train,y_train)

In [ ]:
# Inspect the feature rows of district 54 for the 2016-01-23 slots.
# The day is named explicitly; the cell used to rely on a `day` variable left over
# from whichever loop ran last, which need not be the day whose slots are requested.
x = select_last_points('2016-01-23', RANGE['2016-01-23'])
print(x[columns].loc[(54,RANGE['2016-01-23']),:])
#print get_score(grd, columns)
# print ((y_train_lr - grd.predict(X_train_lr)).abs() / y_train_lr).sum() / (66*y_train_lr.shape[0])

In [ ]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression

# One-hot encode the output of grd.apply (first slice of its last axis) and fit a
# logistic regression on the encoded values of the held-back "*_lr" samples.
lm_params = dict(class_weight={1: 16, 2: 14, 3: 12, 4: 10, 5: 8}, solver='newton-cg',
                 random_state=1, penalty='l2')
enc = OneHotEncoder()
lm = LogisticRegression(**lm_params)

leaves_train = grd.apply(X_train)[:, :, 0]
enc.fit(leaves_train)
leaves_train_lr = grd.apply(X_train_lr)[:, :, 0]
lm.fit(enc.transform(leaves_train_lr), y_train_lr)
# lm.fit(X_train_lr, y_train_lr)

In [ ]:
# Slots used as input for each test day.
SLOT1 = range(44,152,12)
SLOT2 = range(56,152,12)
RANGE = {'2016-01-23': SLOT1, '2016-01-25': SLOT2, '2016-01-27': SLOT1, '2016-01-29': SLOT2, '2016-01-31': SLOT1}
scores = []
n_rows = 0
for day in RANGE.keys():
    x = select_last_points(day, RANGE[day])
    # NOTE(review): [:, 1] takes the predicted probability of the second class and
    # scores it as if it were a gap value -- confirm lm.predict(...) was not intended.
    y_pred_grd_lm = lm.predict_proba(enc.transform(grd.apply(x[columns])[:, :, 0]))[:, 1]
    score = score3(day, pd.Series(y_pred_grd_lm, index=x.index))
    # Weight each day's score by its number of rows.
    scores.append(score * x.shape[0])
    n_rows += x.shape[0]
    print('\t score: {}'.format(score))
# Row-weighted mean over all days; the divisor used to be the hard-coded total 2838.
print(np.array(scores).sum() / n_rows)

In [ ]:
# Collect, per test day, the observed gaps (`ans`) and the relative errors (`scores`)
# returned by score4; each day contributes one column to both frames.
SLOT1 = range(44,152,12)
SLOT2 = range(56,152,12)
RANGE = {'2016-01-23': SLOT1, '2016-01-25': SLOT2, '2016-01-27': SLOT1, '2016-01-29': SLOT2, '2016-01-31': SLOT1}
ans = pd.DataFrame()
scores = pd.DataFrame()
for day in RANGE.keys():
    x = select_last_points(day, RANGE[day])
    # NOTE(review): [:, 1] is the probability of the second class, used here as the prediction -- confirm.
    y_pred_grd_lm = lm.predict_proba(enc.transform(grd.apply(x[columns])[:, :, 0]))[:, 1]
    # NOTE(review): score4 is defined in a cell further down; it must have been run first.
    a, s = score4(day, pd.Series(y_pred_grd_lm, index=x.index))
    # Column-wise concatenation: rows are aligned on the (district, slot) index.
    ans = pd.concat([ans, a], axis=1)
    scores = pd.concat([scores, s], axis=1)

In [ ]:
# Rows whose relative error reaches 0.8 on at least one day, then for each day the
# ten most frequent observed gap values among those rows.
worst = scores[(scores>=0.8).any(1)]
for worst_day in ['2016-01-23', '2016-01-25', '2016-01-27', '2016-01-29', '2016-01-31']:
    print(ans.loc[worst.index][worst_day].value_counts()[:10])

Explore which gap values account for the most common errors


In [ ]:
def score4(day, pred):
    """Return the observed gaps and the absolute relative errors of ``pred`` for ``day``.

    ``pred`` is a Series with one value per pair of ``index(D_range, S_range[day])``.
    Only pairs with a positive observed gap are kept. Both returned DataFrames have a
    single column named after ``day``: the first holds the observed gaps (indexed like
    the kept predictions), the second holds |observed - predicted| / observed.
    """
    observed = test_order[day].reindex(index(D_range, S_range[day])).fillna(0)['gap'].values
    pred = pred[observed > 0]
    observed = observed[observed > 0]
    rel_error = (observed - pred) / observed
    observed_frame = pd.DataFrame(observed, index=pred.index, columns=[day])
    error_frame = pd.DataFrame(np.fabs(rel_error), columns=[day])
    return observed_frame, error_frame

In [ ]:
# Collect, per test day, the observed gaps (`ans`) and the relative errors (`scores`)
# of the classifier `clf`; each day contributes one column to both frames.
SLOT1 = range(44,152,12)
SLOT2 = range(56,152,12)
RANGE = {'2016-01-23': SLOT1, '2016-01-25': SLOT2, '2016-01-27': SLOT1, '2016-01-29': SLOT2, '2016-01-31': SLOT1}
ans = pd.DataFrame()
scores = pd.DataFrame()
for day in RANGE.keys():
    x = select_last_points(day, RANGE[day])
    a, s = score4(day, pd.Series(clf.predict(x[columns]), index=x.index))
    # Column-wise concatenation: rows are aligned on the (district, slot) index.
    ans = pd.concat([ans, a], axis=1)
    scores = pd.concat([scores, s], axis=1)

In [ ]:
# Rows whose relative error reaches 0.8 on at least one day, then for each day the
# ten most frequent observed gap values among those rows.
worst = scores[(scores>=0.8).any(1)]
for worst_day in ['2016-01-23', '2016-01-25', '2016-01-27', '2016-01-29', '2016-01-31']:
    print(ans.loc[worst.index][worst_day].value_counts()[:10])

Method3 by GradientBoosting on regression


In [ ]:
from sklearn.ensemble import GradientBoostingRegressor
# This cell re-assigns the global S_range for the regression section.
slot1 = range(45,153,12) # Last time slot of test slot for day 23, 27, 31
slot2 = range(57,153,12) # Last time slot of test slot for day 25, 29
# Slots per test day, keyed by date string
S_range = {'2016-01-23':slot1, '2016-01-25':slot2, '2016-01-27':slot1, '2016-01-29':slot2, '2016-01-31':slot1}

In [ ]:
def write3(x, day, slot, regr, mode, filename='ans3_v1.csv'):
    """Write one predicted gap per (district, slot) to a CSV file.

    The regressor predicts a change that is added to the current 'gap' of the row.
    Pairs missing from ``x`` get a gap of 1; negative results are clipped to 0.

    Parameters
    ----------
    x : DataFrame of features indexed by (district, slot), with a 'gap' column
    day : prefix of the slot identifier
    slot : iterable of int slots; the row is labelled with ``S+1``
    regr : fitted estimator exposing ``predict``
    mode : file mode passed to ``open``
    filename : output path (defaults to the original hard-coded 'ans3_v1.csv')
    """
    with open(filename, mode) as f:
        writer = csv.writer(f, delimiter=',')
        for D in D_range:
            for S in slot:
                key = (D,S)
                if key in x.index:
                    # Series.reshape was deprecated and later removed from pandas;
                    # reshape the underlying array instead.
                    features = x.loc[key].values.reshape(1, -1)
                    gap = x['gap'].loc[key] + regr.predict(features)[0]
                else:
                    gap = 1
                gap = 0 if gap < 0 else gap
                writer.writerow([str(D),'{}-{}'.format(day,S+1), '{:.15f}'.format(gap)])

Training Regressor


In [ ]:
# Parameters noted for ans3_v1.csv (kept for reference):
# params = {'loss': 'huber', 'alpha': 0.9, 'n_estimators':250, 'max_features':0.5, 'random_state':1,
#           'warm_start':False,'max_depth': 3, 'learning_rate': 0.1, 'subsample': 1.0}
#-----brand new tuning
params = {'loss': 'huber', 'alpha': 0.9, 'n_estimators':20, 'max_features':1.0, 'random_state':1,
          'warm_start':False,'max_depth': 10, 'learning_rate': 0.25, 'subsample': 0.85,
          'min_samples_leaf': 25, 'min_samples_split':100}

# Parameters to be searched
parameters = {'n_estimators': np.arange(100,600,150)}

In [ ]:
# Searching best parameters
# scoring_function = make_scorer(mean_squared_error, greater_is_better=False)
# regr = grid_search.GridSearchCV(GradientBoostingRegressor(**params), 
#                                 param_grid=parameters, scoring=scoring_function, cv=3)
# regr.fit(X, Y)
# Regr = regr.best_estimator_
from sklearn.ensemble import GradientBoostingRegressor

# Fixed configuration used instead of the grid search above (this re-assigns `params`).
params = dict(loss='huber', alpha=0.9, n_estimators=250, max_features=0.5, random_state=1,
              warm_start=False, max_depth=3, learning_rate=0.1, subsample=1.0)
Regr = GradientBoostingRegressor(**params)
Regr.fit(X, Y)
print(Regr)

Predict Score


In [ ]:
# max_features: sqrt -> 0.5 : 0.538287 -> 0.528706599387
# max_features: 0.5 -> 0.8 : 0.528706599387 -> 0.530272920336
#-----brand new tuning
# alpha: 0.9 -> 0.6 : 0.537699383041 -> 0.576867122284
# learning_rate: 0.2 : 0.536744595218
SLOT1 = range(44,152,12)
SLOT2 = range(56,152,12)
RANGE = {'2016-01-23': SLOT1, '2016-01-25': SLOT2, '2016-01-27': SLOT1, '2016-01-29': SLOT2, '2016-01-31': SLOT1}
scores = []
n_rows = 0
for day in RANGE.keys():
    x = select_last_points(day, RANGE[day])
    # The regressor output is added to the current gap before scoring.
    score = score3(day, x['gap']+Regr.predict(x))
    # Weight each day's score by its number of rows.
    scores.append(score * x.shape[0])
    n_rows += x.shape[0]
    print('\t score: {}'.format(score))
# Row-weighted mean over all days; the divisor used to be the hard-coded total 2838.
print(np.array(scores).sum() / n_rows)

Writing ANS


In [ ]:
SLOT1 = range(45,153,12)
SLOT2 = range(57,153,12)
RANGE = {'2016-01-23': SLOT1, '2016-01-25': SLOT2, '2016-01-27': SLOT1, '2016-01-29': SLOT2, '2016-01-31': SLOT1}
scores = []
for i, day in enumerate(RANGE.keys()):
    x = select_last_points(day, RANGE[day])
    # Use the slots declared in this cell for both selection and writing (the global
    # S_range is re-assigned by other cells). The first day truncates the answer
    # file and the others append, so a re-run does not duplicate rows.
    write3(x, day, RANGE[day], Regr, 'w' if i == 0 else 'a')

Naive Method 2 by using interpolation


In [ ]:
# Configuration of the naive-method section: S_range is re-assigned here and is keyed
# by the days 22/24/26/28/30.
slot1 = range(45,153,12)
slot2 = range(57,153,12)
S_range = {'2016-01-22':slot1, '2016-01-24':slot2, '2016-01-26':slot1, '2016-01-28':slot2, '2016-01-30':slot1}
D_range = range(1,67)

In [ ]:
# Reference training days for each evaluated day: the same weekday in the preceding
# weeks (7-day steps). The entry for 2016-01-22 used to start at '1/2/2016', which is
# not 7-day aligned with the 22nd and disagreed with the other definitions of these
# reference days in this notebook ('1/1/2016').
Day_range = {'2016-01-22':pd.date_range('1/1/2016', periods=3, freq='7D'),
             '2016-01-24':pd.date_range('1/3/2016', periods=3, freq='7D'),
             '2016-01-26':pd.date_range('1/5/2016', periods=3, freq='7D'),
             '2016-01-28':pd.date_range('1/7/2016', periods=3, freq='7D'),
             '2016-01-30':pd.date_range('1/9/2016', periods=2, freq='7D')}
def naive_score2(day):
    """Score naive method 2 (previous value plus averaged slot-to-slot change) on ``day``.

    The prediction for each scored row is the preceding row of ``test_order[day]`` plus
    the change to the next slot averaged over the reference days ``Day_range[day]``.

    Returns the column-wise sum of |observed - predicted| / observed divided by
    ``66 * len(S_range[day])``.
    """
    # Rows of the day whose slot (second index element) is one of the scored slots.
    ans = test_order[day].select(lambda x: x[1] in S_range[day])

    deltas = []
    for d in Day_range[day]:
        # Full (district, slot) grid of the reference day, missing pairs as 0.
        data = train_order[str(d.date())].reindex([x for x in itertools.product(D_range,range(1,145))]).fillna(0)
        # Change from each row to the following one.
        data = data.diff().shift(-1)
        data = data.loc[ans.index]
        deltas.append(data)

    delta = deltas[0]
    for i in range(1,len(deltas)):
        # DataFrame.add returns a new frame. The result used to be discarded, so only
        # the first reference day ever contributed to the "average".
        delta = delta.add(deltas[i], fill_value=0)
    pred = test_order[day].shift().loc[ans.index].fillna(0)+(delta/len(deltas))
    gap = (ans - pred) / ans

    return gap.abs().sum()/(66*len(S_range[day]))

In [ ]:
# Naive method 2 score for each of the five evaluated days.
for scored_day in ['2016-01-22', '2016-01-24', '2016-01-26', '2016-01-28', '2016-01-30']:
    print(naive_score2(scored_day))

In [ ]:
def slope(day):
    """Return the base rows of ``day`` and the averaged slot-to-slot change.

    The first element holds the rows of ``test_order[day]`` whose slot is in
    ``S_range[day]``. The second is the mean, over the reference days
    ``Day_range[day]``, of the change from each (district, slot) row to the next one,
    computed on the full district x slot grid with missing pairs counted as 0.
    """
    base_points = test_order[day].select(lambda key: key[1] in S_range[day])

    full_grid = [pair for pair in itertools.product(D_range, range(1, 145))]
    deltas = []
    for ref_day in Day_range[day]:
        ref_orders = train_order[str(ref_day.date())].reindex(full_grid).fillna(0)
        deltas.append(ref_orders.diff().shift(-1))

    total = deltas[0]
    for extra in deltas[1:]:
        total = total + extra
    return base_points, total / len(deltas)

In [ ]:
def write2(day, base_points, slot, delta, mode):
    """Write the naive-method-2 prediction of every (district, slot) to 'ans.csv'.

    For districts 1..66 and each slot S, the prediction is the 'gap' of
    ``base_points`` at (district, S-1) plus the 'gap' of ``delta`` at the same key.
    If that sum is negative the base value alone is written. Pairs missing from
    ``base_points`` are written as 0.0. Rows are formatted as
    ``district,<day>-<S>,<gap with 3 decimals>``.
    """
    known_keys = base_points.index
    with open('ans.csv', mode) as out:
        emit = csv.writer(out, delimiter=',').writerow
        for district_id in range(1, 67):
            for target in slot:
                previous = (district_id, target - 1)
                value = 0.0
                if previous in known_keys:
                    base = base_points['gap'].loc[previous]
                    value = base + delta['gap'].loc[previous]
                    if value < 0:
                        value = base
                emit([district_id, '{}-{}'.format(day, target), '{:.3f}'.format(value)])

Predict Day22


In [ ]:
# Naive method 2 for 2016-01-22; mode 'w' makes write2 start 'ans.csv' afresh.
day = '2016-01-22'
base_points, delta = slope(day)
write2(day, base_points, test_slot1, delta, 'w')

Predict Day24


In [ ]:
# Naive method 2 for 2016-01-24; mode 'a' appends to 'ans.csv'.
day = '2016-01-24'
base_points, delta = slope(day)
write2(day, base_points, test_slot2, delta, 'a')

Predict Day26


In [ ]:
# Naive method 2 for 2016-01-26; mode 'a' appends to 'ans.csv'.
day = '2016-01-26'
base_points, delta = slope(day)
write2(day, base_points, test_slot1, delta, 'a')

Predict Day28


In [ ]:
# Naive method 2 for 2016-01-28; mode 'a' appends to 'ans.csv'.
day = '2016-01-28'
base_points, delta = slope(day)
write2(day, base_points, test_slot2, delta, 'a')

Predict Day30


In [ ]:
# Naive method 2 for 2016-01-30; mode 'a' appends to 'ans.csv'.
day = '2016-01-30'
base_points, delta = slope(day)
write2(day, base_points, test_slot1, delta, 'a')

Predict the score of naive method 1


In [ ]:
# Reference training days for each evaluated day, in 7-day steps
# (three days each, two for 2016-01-30).
Day_range = {'2016-01-22':pd.date_range('1/1/2016', periods=3, freq='7D'),
             '2016-01-24':pd.date_range('1/3/2016', periods=3, freq='7D'),
             '2016-01-26':pd.date_range('1/5/2016', periods=3, freq='7D'),
             '2016-01-28':pd.date_range('1/7/2016', periods=3, freq='7D'),
             '2016-01-30':pd.date_range('1/9/2016', periods=2, freq='7D')}
def naive_score1(day):
    """Score naive method 1 (mean of the reference days) on ``day``.

    The prediction for each scored (district, slot) row is the mean of the values of
    the same row on the reference days ``Day_range[day]`` (missing values count as 0).

    Returns the column-wise sum of |observed - predicted| / observed divided by
    ``66 * len(S_range[day])``.
    """
    # Rows of the day whose slot (second index element) is one of the scored slots.
    ans = test_order[day].select(lambda x: x[1] in S_range[day])

    prediction = []
    for d in Day_range[day]:
        data = train_order[str(d.date())]
        data = data.loc[ans.index].fillna(0)
        prediction.append(data)

    pred = prediction[0]
    for i in range(1,len(prediction)):
        # DataFrame.add returns a new frame. The result used to be discarded, so the
        # "mean" was just the first reference day divided by the number of days.
        pred = pred.add(prediction[i], fill_value=0)
    pred = pred/len(prediction)
    gap = (ans - pred) / ans

    return gap.abs().sum()/(66*len(S_range[day]))

In [ ]:
# Naive method 1 score for each of the five evaluated days.
for scored_day in ['2016-01-22', '2016-01-24', '2016-01-26', '2016-01-28', '2016-01-30']:
    print(naive_score1(scored_day))

Naive Method 1 by using mean


In [ ]:
# Slots written by naive method 1.
# NOTE(review): this re-assigns test_slot1/test_slot2, starting one slot later (46/58)
# than the values set in the notebook's configuration cell (45/57) -- confirm which
# definition later cells expect.
test_slot1 = range(46,154,12)
test_slot2 = range(58,154,12)

Predict Day22


In [ ]:
# One groupby object per reference day for 2016-01-22; filled by the loop in the next cell.
test_day22 = []

In [ ]:
# Reference days for 2016-01-22: group the unanswered orders of each day by (district, time slot).
for day in pd.date_range('1/1/2016', periods=3, freq='7D'):
    # Read data. `path` is nested by data set in the configuration cell, so the
    # training order files are addressed through path['train'] (path['order'] does not exist).
    order = pd.read_table(path['train']['order'].format(day.date()), header=None, usecols=[1,3,6],
                      names=['driver', 'district', 'time'])
    # Select NA for calculating the value of gap (copy so the columns below are set on an independent frame)
    order = order[order['driver'].isnull()].copy()
    # Translating hash to id with the district table loaded in the configuration cell
    order['district'] = order['district'].apply(lambda x: district_dict[x])
    # Translating timestamp to time_id: minutes since midnight / 10, 1-based
    order['time'] = pd.to_datetime(order['time'])
    order['time'] = (order['time'] - day) / M / 10 + 1
    order['time'] = order['time'].astype(int)
    # Grouping by district and time
    test_day22.append(order.groupby(['district', 'time']))

In [ ]:
# Naive method 1 for 2016-01-22: each (district, slot) is predicted as the mean number
# of unanswered orders in the same group over the reference days. Starts 'ans.csv' afresh.
with open('ans.csv', 'w') as f:
    writer = csv.writer(f, delimiter=',')
    for D, S in itertools.product(range(1, 67), test_slot1):
        key = (D, S)
        gap = []
        for data in test_day22:
            # A group that does not exist on a reference day counts as 0 orders.
            gap.append(data.get_group(key).shape[0] if key in data.groups else 0)
        writer.writerow([D, '2016-01-22-{}'.format(S), '{:.3f}'.format(np.mean(gap))])

Predict Day24


In [ ]:
# One groupby object per reference day for 2016-01-24; filled by the loop in the next cell.
test_day24 = []

In [ ]:
# Reference days for 2016-01-24: group the unanswered orders of each day by (district, time slot).
for day in pd.date_range('1/3/2016', periods=3, freq='7D'):
    # Read data. `path` is nested by data set in the configuration cell, so the
    # training order files are addressed through path['train'] (path['order'] does not exist).
    order = pd.read_table(path['train']['order'].format(day.date()), header=None, usecols=[1,3,6],
                      names=['driver', 'district', 'time'])
    # Select NA for calculating the value of gap (copy so the columns below are set on an independent frame)
    order = order[order['driver'].isnull()].copy()
    # Translating hash to id with the district table loaded in the configuration cell
    order['district'] = order['district'].apply(lambda x: district_dict[x])
    # Translating timestamp to time_id: minutes since midnight / 10, 1-based
    order['time'] = pd.to_datetime(order['time'])
    order['time'] = (order['time'] - day) / M / 10 + 1
    order['time'] = order['time'].astype(int)
    # Grouping by district and time
    test_day24.append(order.groupby(['district', 'time']))

In [ ]:
# Naive method 1 for 2016-01-24: each (district, slot) is predicted as the mean number
# of unanswered orders in the same group over the reference days. Appends to 'ans.csv'.
with open('ans.csv', 'a') as f:
    writer = csv.writer(f, delimiter=',')
    for D, S in itertools.product(range(1, 67), test_slot2):
        key = (D, S)
        gap = []
        for data in test_day24:
            # A group that does not exist on a reference day counts as 0 orders.
            gap.append(data.get_group(key).shape[0] if key in data.groups else 0)
        writer.writerow([D, '2016-01-24-{}'.format(S), '{:.3f}'.format(np.mean(gap))])

Predict Day26


In [ ]:
# One groupby object per reference day for 2016-01-26; filled by the loop in the next cell.
test_day26 = []

In [ ]:
# Reference days for 2016-01-26: group the unanswered orders of each day by (district, time slot).
for day in pd.date_range('1/5/2016', periods=3, freq='7D'):
    # Read data. `path` is nested by data set in the configuration cell, so the
    # training order files are addressed through path['train'] (path['order'] does not exist).
    order = pd.read_table(path['train']['order'].format(day.date()), header=None, usecols=[1,3,6],
                      names=['driver', 'district', 'time'])
    # Select NA for calculating the value of gap (copy so the columns below are set on an independent frame)
    order = order[order['driver'].isnull()].copy()
    # Translating hash to id with the district table loaded in the configuration cell
    order['district'] = order['district'].apply(lambda x: district_dict[x])
    # Translating timestamp to time_id: minutes since midnight / 10, 1-based
    order['time'] = pd.to_datetime(order['time'])
    order['time'] = (order['time'] - day) / M / 10 + 1
    order['time'] = order['time'].astype(int)
    # Grouping by district and time
    test_day26.append(order.groupby(['district', 'time']))

In [ ]:
# Naive method 1 for 2016-01-26: each (district, slot) is predicted as the mean number
# of unanswered orders in the same group over the reference days. Appends to 'ans.csv'.
with open('ans.csv', 'a') as f:
    writer = csv.writer(f, delimiter=',')
    for D, S in itertools.product(range(1, 67), test_slot1):
        key = (D, S)
        gap = []
        for data in test_day26:
            # A group that does not exist on a reference day counts as 0 orders.
            gap.append(data.get_group(key).shape[0] if key in data.groups else 0)
        writer.writerow([D, '2016-01-26-{}'.format(S), '{:.3f}'.format(np.mean(gap))])

Predict Day28


In [ ]:
# One groupby object per reference day for 2016-01-28; filled by the loop in the next cell.
test_day28 = []

In [ ]:
# Reference days for 2016-01-28: group the unanswered orders of each day by (district, time slot).
for day in pd.date_range('1/7/2016', periods=3, freq='7D'):
    # Read data. `path` is nested by data set in the configuration cell, so the
    # training order files are addressed through path['train'] (path['order'] does not exist).
    order = pd.read_table(path['train']['order'].format(day.date()), header=None, usecols=[1,3,6],
                      names=['driver', 'district', 'time'])
    # Select NA for calculating the value of gap (copy so the columns below are set on an independent frame)
    order = order[order['driver'].isnull()].copy()
    # Translating hash to id with the district table loaded in the configuration cell
    order['district'] = order['district'].apply(lambda x: district_dict[x])
    # Translating timestamp to time_id: minutes since midnight / 10, 1-based
    order['time'] = pd.to_datetime(order['time'])
    order['time'] = (order['time'] - day) / M / 10 + 1
    order['time'] = order['time'].astype(int)
    # Grouping by district and time
    test_day28.append(order.groupby(['district', 'time']))

In [ ]:
# Naive method 1 for 2016-01-28: each (district, slot) is predicted as the mean number
# of unanswered orders in the same group over the reference days. Appends to 'ans.csv'.
with open('ans.csv', 'a') as f:
    writer = csv.writer(f, delimiter=',')
    for D, S in itertools.product(range(1, 67), test_slot2):
        key = (D, S)
        gap = []
        for data in test_day28:
            # A group that does not exist on a reference day counts as 0 orders.
            gap.append(data.get_group(key).shape[0] if key in data.groups else 0)
        writer.writerow([D, '2016-01-28-{}'.format(S), '{:.3f}'.format(np.mean(gap))])

Predict Day30


In [ ]:
# One groupby object per reference day for 2016-01-30; filled by the loop in the next cell.
test_day30 = []

In [ ]:
# Reference days for 2016-01-30 (two days only): group the unanswered orders of each day by (district, time slot).
for day in pd.date_range('1/9/2016', periods=2, freq='7D'):
    # Read data. `path` is nested by data set in the configuration cell, so the
    # training order files are addressed through path['train'] (path['order'] does not exist).
    order = pd.read_table(path['train']['order'].format(day.date()), header=None, usecols=[1,3,6],
                      names=['driver', 'district', 'time'])
    # Select NA for calculating the value of gap (copy so the columns below are set on an independent frame)
    order = order[order['driver'].isnull()].copy()
    # Translating hash to id with the district table loaded in the configuration cell
    order['district'] = order['district'].apply(lambda x: district_dict[x])
    # Translating timestamp to time_id: minutes since midnight / 10, 1-based
    order['time'] = pd.to_datetime(order['time'])
    order['time'] = (order['time'] - day) / M / 10 + 1
    order['time'] = order['time'].astype(int)
    # Grouping by district and time
    test_day30.append(order.groupby(['district', 'time']))

In [ ]:
# Naive method 1 for 2016-01-30: each (district, slot) is predicted as the mean number
# of unanswered orders in the same group over the reference days. Appends to 'ans.csv'.
with open('ans.csv', 'a') as f:
    writer = csv.writer(f, delimiter=',')
    for D, S in itertools.product(range(1, 67), test_slot1):
        key = (D, S)
        gap = []
        for data in test_day30:
            # A group that does not exist on a reference day counts as 0 orders.
            gap.append(data.get_group(key).shape[0] if key in data.groups else 0)
        writer.writerow([D, '2016-01-30-{}'.format(S), '{:.3f}'.format(np.mean(gap))])

Process order


In [ ]:
# Day to process: as a string (used to build the file path) and as a Timestamp
# (used to turn order times into slots).
day = '2016-01-01'
Day = pd.Timestamp(day)

In [ ]:
# `path` is nested by data set in the configuration cell, so the training order file is
# addressed through path['train'] (path['order'] does not exist).
order = pd.read_table(path['train']['order'].format(day), header=None, usecols=[1,3,6],
                      names=['driver', 'district_id', 'time'])
# Select NA for calculating the value of gap; copy so that columns added in later
# cells are set on an independent frame rather than on a slice.
order = order[order['driver'].isnull()].copy()

Translating district hash to id


In [ ]:
# Map each district hash to its id with the table loaded in the configuration cell
# (`district_dict`; the name `district` is not defined there).
# Not idempotent: once the column holds ids, re-running this cell raises KeyError.
order['district_id'] = order['district_id'].apply(lambda x: district_dict[x])

Translating timestamp to slot


In [ ]:
# Convert the 'time' column to pandas datetimes so it can be subtracted from `Day`.
order['time'] = pd.to_datetime(order['time'])

In [ ]:
# Time elapsed since the start of the day, in units of M, divided into blocks of 10 and
# numbered from 1; the fractional part is dropped by the integer cast.
elapsed_units = (order['time'] - Day) / M
order['time_slot'] = (elapsed_units / 10 + 1).astype(int)