In [133]:
from pandas import Series, DataFrame
import pandas as pd
import numpy as np

import sklearn
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import preprocessing

In [22]:
air_raw = DataFrame.from_csv("allyears_tiny.csv", index_col = False)
print(air_raw.head())

air_raw['RandNum'] = Series(np.random.uniform(size = len(air_raw['Origin'])))
print(air_raw.head())


   Year  Month  DayofMonth  DayOfWeek  DepTime  CRSDepTime  ArrTime  \
0  1987     10          14          3      741         730      912   
1  1987     10          15          4      729         730      903   
2  1987     10          17          6      741         730      918   
3  1987     10          18          7      729         730      847   
4  1987     10          19          1      749         730      922   

   CRSArrTime UniqueCarrier  FlightNum      ...       Cancelled  \
0         849            PS       1451      ...               0   
1         849            PS       1451      ...               0   
2         849            PS       1451      ...               0   
3         849            PS       1451      ...               0   
4         849            PS       1451      ...               0   

   CancellationCode  Diverted  CarrierDelay  WeatherDelay  NASDelay  \
0               NaN         0           NaN           NaN       NaN   
1               NaN         0           NaN           NaN       NaN   
2               NaN         0           NaN           NaN       NaN   
3               NaN         0           NaN           NaN       NaN   
4               NaN         0           NaN           NaN       NaN   

  SecurityDelay LateAircraftDelay  IsArrDelayed  IsDepDelayed  
0           NaN               NaN           YES           YES  
1           NaN               NaN           YES            NO  
2           NaN               NaN           YES           YES  
3           NaN               NaN            NO            NO  
4           NaN               NaN           YES           YES  

[5 rows x 31 columns]
   Year  Month  DayofMonth  DayOfWeek  DepTime  CRSDepTime  ArrTime  \
0  1987     10          14          3      741         730      912   
1  1987     10          15          4      729         730      903   
2  1987     10          17          6      741         730      918   
3  1987     10          18          7      729         730      847   
4  1987     10          19          1      749         730      922   

   CRSArrTime UniqueCarrier  FlightNum    ...     CancellationCode  Diverted  \
0         849            PS       1451    ...                  NaN         0   
1         849            PS       1451    ...                  NaN         0   
2         849            PS       1451    ...                  NaN         0   
3         849            PS       1451    ...                  NaN         0   
4         849            PS       1451    ...                  NaN         0   

   CarrierDelay  WeatherDelay  NASDelay  SecurityDelay LateAircraftDelay  \
0           NaN           NaN       NaN            NaN               NaN   
1           NaN           NaN       NaN            NaN               NaN   
2           NaN           NaN       NaN            NaN               NaN   
3           NaN           NaN       NaN            NaN               NaN   
4           NaN           NaN       NaN            NaN               NaN   

  IsArrDelayed  IsDepDelayed   RandNum  
0          YES           YES  0.193944  
1          YES            NO  0.466327  
2          YES           YES  0.943457  
3           NO            NO  0.232673  
4          YES           YES  0.133799  

[5 rows x 32 columns]

In [122]:
air_mapped = DataFrame()

air_mapped['RandNum'] = air_raw['RandNum']

air_mapped['IsDepDelayed'] = air_raw['IsDepDelayed']
air_mapped['IsDepDelayedInt'] = air_mapped.apply(lambda row:
                                                 1 if row['IsDepDelayed'] == 'YES' else 0,
                                                 axis=1)
del air_mapped['IsDepDelayed']
print(air_mapped.shape)

lb_origin = sklearn.preprocessing.LabelBinarizer()
lb_origin.fit(air_raw['Origin'])
tmp_origin = lb_origin.transform(air_raw['Origin'])
tmp_origin_df = DataFrame(tmp_origin)
print(tmp_origin_df.shape)

lb_dest = sklearn.preprocessing.LabelBinarizer()
lb_dest.fit(air_raw['Dest'])
tmp_dest = lb_origin.transform(air_raw['Dest'])
tmp_dest_df = DataFrame(tmp_dest)
print(tmp_dest_df.shape)

lb_uniquecarrier = sklearn.preprocessing.LabelBinarizer()
lb_uniquecarrier.fit(air_raw['UniqueCarrier'])
tmp_uniquecarrier = lb_origin.transform(air_raw['UniqueCarrier'])
tmp_uniquecarrier_df = DataFrame(tmp_uniquecarrier)
print(tmp_uniquecarrier_df.shape)

air_mapped = pd.concat([
                        air_mapped, 
                        tmp_origin_df, 
                        tmp_dest_df, 
                        air_raw['Distance'],
                        tmp_uniquecarrier_df, 
                        air_raw['Month'],
                        air_raw['DayofMonth'],
                        air_raw['DayOfWeek'],
                        ],
                       axis=1)
print(air_mapped.shape)
air_mapped

air = air_mapped


(999, 2)
(999, 10)
(999, 10)
(999, 10)
(999, 36)

In [124]:
air_train = air.ix[air['RandNum'] <= 0.8]
# air_valid = air.ix[(air['RandNum'] > 0.8) & (air['RandNum'] <= 0.9)]
air_test  = air.ix[air['RandNum'] > 0.9]

print(air_train.shape)
print(air_test.shape)


(824, 36)
(91, 36)

In [128]:
X_train = air_train.copy(deep=True)
del X_train['RandNum']
del X_train['IsDepDelayedInt']
print(list(X_train.columns.values))
print(X_train.shape)

y_train = air_train['IsDepDelayedInt']
print(y_train.shape)


[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 'Distance', 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 'Month', 'DayofMonth', 'DayOfWeek']
(824, 34)
(824,)

In [132]:
clf = GradientBoostingClassifier(n_estimators = 10, max_depth = 3, learning_rate = 0.01)
clf.fit(X_train, y_train)


Out[132]:
GradientBoostingClassifier(init=None, learning_rate=0.01, loss='deviance',
              max_depth=3, max_features=None, max_leaf_nodes=None,
              min_samples_leaf=1, min_samples_split=2, n_estimators=10,
              random_state=None, subsample=1.0, verbose=0,
              warm_start=False)

In [137]:
X_test = air_test.copy(deep=True)
del X_test['RandNum']
del X_test['IsDepDelayedInt']
print(list(X_test.columns.values))
print(X_test.shape)

print("")
print("--- PREDICTIONS ---")
print("")
pred = clf.predict(X_test)
print(pred)


[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 'Distance', 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 'Month', 'DayofMonth', 'DayOfWeek']
(91, 34)

--- PREDICTIONS ---

[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]