http://chriswhong.com/open-data/foil_nyc_taxi/
http://www.andresmh.com/nyctaxitrips/
Potential models
In [1]:
import pandas
%pylab inline
In [2]:
# Maximum number of rows to read from each CSV. Kept as an int because it is
# a row count (read_csv documents `nrows` as an integer).
N = int(5e6)
data = pandas.read_csv("trip_data_1.csv", nrows=N)
In [3]:
# Attach the fare columns to the trip rows.
# The join is on the row index, so it presumably relies on both CSV files
# listing the trips in the same order -- TODO confirm.
fare_cols = [
    u' payment_type',
    u' fare_amount',
    u' surcharge',
    u' mta_tax',
    u' tip_amount',
    u' tolls_amount',
    u' total_amount',
]
fare_data = pandas.read_csv("trip_fare_1.csv", nrows=N)
data = data.join(fare_data[fare_cols])
del fare_data  # only the selected columns are needed from here on
data.head(10)
Out[3]:
In [46]:
# Peek at the first rows (index labels up to and including 5) of the first five columns.
leading_columns = data.columns[:5]
data.loc[:5, leading_columns]
Out[46]:
In [5]:
# Trip duration against total amount; the axis limits crop the far outliers.
ax = data.plot(x="trip_time_in_secs", y=" total_amount", kind="scatter", s=2)
ax.set_xlim(0, 1e4)
ax.set_ylim(0, 300)
Out[5]:
In [44]:
# Remove trips shorter than 500 s whose total amount exceeds 30.
# A boolean mask is used instead of where(...)[0] + drop(): where() returns
# row *positions* while drop() expects index *labels*, which only coincide
# while the index is still the default 0..n-1 range.
suspicious = (data.trip_time_in_secs < 500) & (data[' total_amount'] > 30)
data = data[~suspicious]
In [218]:
# Histogram of drop-off latitudes strictly between 40.6 and 40.9.
dropoff_lat = data.dropoff_latitude
dropoff_lat[(dropoff_lat > 40.6) & (dropoff_lat < 40.9)].hist(bins=20);
In [231]:
# Histogram of drop-off longitudes strictly between -74.05 and -73.9.
dropoff_lon = data.dropoff_longitude
dropoff_lon[(dropoff_lon > -74.05) & (dropoff_lon < -73.9)].hist(bins=20);
In [3]:
# Number of trips per vendor.
vendor_counts = data["vendor_id"].value_counts()
vendor_counts.plot(kind="bar");
In [4]:
# Number of trips per rate code, on a log y-axis.
rate_code_counts = data["rate_code"].value_counts()
rate_code_counts.plot(kind="bar", logy=True, ylim=(1,1e8));
In [5]:
# Number of trips per store-and-forward flag value.
flag_counts = data["store_and_fwd_flag"].value_counts()
flag_counts.plot(kind="bar");
In [16]:
# Number of trips per passenger count.
passenger_counts = data["passenger_count"].value_counts()
passenger_counts.plot(kind="bar");
In [19]:
# Distribution of trip durations below 4000 seconds.
trip_secs = data["trip_time_in_secs"]
trip_secs[trip_secs < 4000].hist(bins=30);
In [21]:
# Distribution of trip distances below 22.
trip_dist = data["trip_distance"]
trip_dist[trip_dist < 22].hist(bins=30);
In [6]:
# Number of trips per payment type, on a log y-axis.
payment_counts = data[' payment_type'].value_counts()
payment_counts.plot(kind="bar", logy=True, ylim=(1,1e8));
In [6]:
# Trip duration against trip distance; the axis limits crop the far outliers.
ax = data.plot(x="trip_time_in_secs", y="trip_distance", kind="scatter", s=2)
ax.set_xlim(0, 5000)
ax.set_ylim(0, 40)
Out[6]:
In [9]:
# Pixel scatter of pick-up locations: latitude on the x-axis, longitude on
# the y-axis, with both axes cropped to a fixed window.
figure(figsize=(16, 8))
pickup_lat = data["pickup_latitude"]
pickup_lon = data["pickup_longitude"]
plot(pickup_lat, pickup_lon, 'b,')
xlim(40.6, 40.9)
ylim(-74.05, -73.9)
Out[9]:
In [6]:
# Distribution of tip amounts below 15.
tips = data[' tip_amount']
tips[tips < 15].hist(bins=30);
In [4]:
# Drop cash payments (presumably because cash tips are not recorded in this
# data set -- TODO confirm) and renumber the rows 0..n-1 so that positional
# index arrays can be used as labels afterwards.
# Only the last expression of a cell is displayed, so the row count from
# before the filter is captured and shown together with the count after it.
n_before = len(data)
data = data[data[' payment_type'] != "CSH"].reset_index(drop=True)
(n_before, len(data))
Out[4]:
In [5]:
# Binary target: 1 when the recorded tip amount is positive, 0 otherwise.
has_tip = data[' tip_amount'] > 0
data['tipped'] = has_tip.astype("int")
data['tipped'].value_counts()
Out[5]:
In [6]:
# Baseline feature set: the numeric trip and fare columns.
feats1 = [
    u'rate_code',
    'passenger_count',
    u'trip_time_in_secs',
    u'trip_distance',
    u'pickup_longitude',
    u'pickup_latitude',
    u'dropoff_longitude',
    u'dropoff_latitude',
    ' fare_amount',
    u' surcharge',
    u' mta_tax',
    ' tolls_amount',
]
In [7]:
# Random 80/20 train/test split over row positions.
# The generator is seeded so the split (and every score derived from it) is
# the same on each run. `random` is presumably numpy.random as exposed by
# %pylab -- TODO confirm.
random.seed(0)
M = len(data)
n_test = int(M*0.2)
rand_idx = arange(M)
random.shuffle(rand_idx)
train_idx = rand_idx[n_test:]  # remaining 80% of the shuffled rows
test_idx = rand_idx[:n_test]   # first 20% of the shuffled rows
In [8]:
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
In [15]:
# Standardise the baseline features and check the shape of the training part.
# NOTE(review): the scaler is fitted on every row, including the held-out ones.
sc = StandardScaler()
baseline_features = data[feats1]
data_scaled = sc.fit_transform(baseline_features)
train_rows = train_idx.tolist()
data_scaled[train_rows, :].shape
Out[15]:
In [27]:
from sklearn.pipeline import Pipeline

# Linear baseline. SGD is sensitive to feature scale, so the scaler is chained
# in front of the classifier rather than fitting on the raw columns. Inside a
# pipeline the scaler is fitted on the training rows only, and the same
# transform is applied automatically whenever `sgd` predicts.
sgd = Pipeline([
    ("scale", StandardScaler()),
    ("sgd", SGDClassifier(loss="modified_huber")),
])
sgd.fit(data.loc[train_idx, feats1], data['tipped'].loc[train_idx])
Out[27]:
In [28]:
# Class probabilities from the SGD model for the held-out rows.
test_features = data.loc[test_idx, feats1]
preds = sgd.predict_proba(test_features)
In [29]:
# The metric helpers are otherwise only imported further down the notebook,
# so a top-to-bottom run would fail here with a NameError without this import.
from sklearn.metrics import roc_curve, roc_auc_score

# ROC curve and AUC of the SGD model, scored on the probability of class 1.
y_test = data['tipped'].loc[test_idx]
fpr, tpr, thr = roc_curve(y_test, preds[:,1])
auc = roc_auc_score(y_test, preds[:,1])
In [30]:
# Display the ROC AUC computed for the SGD model on the held-out rows.
auc
Out[30]:
In [41]:
# ROC curve of the SGD model, followed by the diagonal of a chance-level classifier.
plot(fpr, tpr, fpr, fpr)
xlabel("False positive rate")
ylabel("True positive rate")
Out[41]:
In [9]:
from sklearn.ensemble import RandomForestClassifier
In [10]:
# Replace every missing value in the frame with 0, then count how many nulls
# are left in the training features.
# NOTE(review): this fills all columns, not only the model features.
data.fillna(value=0, inplace=True)
train_features = data.loc[train_idx, feats1]
count_nonzero(pandas.isnull(train_features))
Out[10]:
In [33]:
# Random forest on the baseline features (100 trees, all available cores).
rf1 = RandomForestClassifier(n_estimators=100, n_jobs=-1)
X_train1 = data.loc[train_idx, feats1]
y_train1 = data['tipped'].loc[train_idx]
rf1.fit(X_train1, y_train1)
Out[33]:
In [34]:
# Class probabilities from the baseline forest for the held-out rows.
X_test1 = data.loc[test_idx, feats1]
preds1 = rf1.predict_proba(X_test1)
In [21]:
from sklearn.metrics import roc_curve, roc_auc_score
In [36]:
# ROC curve and AUC of the baseline forest, scored on the probability of class 1.
y_test1 = data['tipped'].loc[test_idx]
tipped_prob1 = preds1[:, 1]
fpr1, tpr1, thr1 = roc_curve(y_test1, tipped_prob1)
auc1 = roc_auc_score(y_test1, tipped_prob1)
In [ ]:
# Print the AUC, then show the baseline forest's score() on the held-out rows.
print(auc1)
rf1.score(data.loc[test_idx, feats1], data.loc[test_idx, 'tipped'])
In [42]:
# ROC curve of the baseline forest, followed by the chance-level diagonal.
plot(fpr1, tpr1, fpr1, fpr1)
xlabel("False positive rate")
ylabel("True positive rate")
Out[42]:
In [44]:
# Feature importances of the baseline forest, most important first.
fi = sorted(zip(feats1, rf1.feature_importances_), key=lambda pair: -pair[1])
pandas.DataFrame(fi, columns=["Feature","Importance"])
Out[44]:
In [45]:
# Derive an average-speed feature (distance per second).
# Durations below 1e-3 are replaced with -1 first so the division never hits
# zero. The assignment goes through .loc: the chained form
# data[col][mask] = value may write to a temporary copy and triggers
# SettingWithCopyWarning.
too_short = data['trip_time_in_secs'] < 1e-3
data.loc[too_short, 'trip_time_in_secs'] = -1
data['speed'] = data['trip_distance'] / data['trip_time_in_secs']
In [46]:
# Second feature set: the baseline features with trip duration swapped for speed.
feats2 = list(feats1)
feats2.remove('trip_time_in_secs')
feats2.append('speed')
In [ ]:
# Random forest on the speed-based feature set.
rf2 = RandomForestClassifier(n_estimators=100, n_jobs=-1)
X_train2 = data.loc[train_idx, feats2]
y_train2 = data['tipped'].loc[train_idx]
rf2.fit(X_train2, y_train2)
In [ ]:
# Class probabilities from the second forest for the held-out rows.
X_test2 = data.loc[test_idx, feats2]
preds2 = rf2.predict_proba(X_test2)
In [ ]:
# ROC curve and AUC of the second forest, scored on the probability of class 1.
y_test2 = data['tipped'].loc[test_idx]
tipped_prob2 = preds2[:, 1]
fpr2, tpr2, thr2 = roc_curve(y_test2, tipped_prob2)
auc2 = roc_auc_score(y_test2, tipped_prob2)
In [ ]:
# Print the AUC of the second forest and draw its ROC curve, followed by the
# chance-level diagonal.
print(auc2)
plot(fpr2, tpr2)
plot(fpr2, fpr2)
In [ ]:
# Feature importances of the second forest, least important first.
fi2 = sorted(zip(feats2, rf2.feature_importances_), key=lambda pair: pair[1])
fi2
In [11]:
# Start the third feature set from a *copy* of the baseline list. A plain
# `feats3 = feats1` would make both names refer to the same list, so the
# in-place `feats3 += ...` additions below would silently grow feats1 too.
feats3 = list(feats1)
In [12]:
# Display the third feature set before the categorical columns are appended.
feats3
Out[12]:
In [13]:
from sklearn.feature_extraction import DictVectorizer
In [14]:
def cat_to_num(data):
    """One-hot encode a categorical Series.

    Parameters
    ----------
    data : pandas.Series
        Column to encode; its ``name`` is used as the prefix of the new columns.

    Returns
    -------
    pandas.DataFrame
        One 0/1 integer column per distinct value, named ``"<name>:<value>"``.
    """
    return pandas.DataFrame({
        "%s:%s"%(data.name, cat): (data == cat).astype("int")
        for cat in unique(data)
    })
In [15]:
# One-hot encode each categorical column with cat_to_num.
payment_type_cats, vendor_id_cats, store_and_fwd_flag_cats, rate_code = (
    cat_to_num(data[column])
    for column in (' payment_type', 'vendor_id', 'store_and_fwd_flag', 'rate_code')
)
In [16]:
# Append every block of one-hot columns to the frame and register the new
# column names as features.
# NOTE(review): re-running this cell joins the same columns a second time.
for encoded in (payment_type_cats, vendor_id_cats, store_and_fwd_flag_cats, rate_code):
    data = data.join(encoded)
    feats3.extend(encoded.columns)
In [17]:
# Display the third feature set after the one-hot column names were appended.
feats3
Out[17]:
In [18]:
# Random forest on the feature set that includes the one-hot columns.
rf3 = RandomForestClassifier(n_estimators=100, n_jobs=-1)
X_train3 = data.loc[train_idx, feats3]
y_train3 = data['tipped'].loc[train_idx]
rf3.fit(X_train3, y_train3)
Out[18]:
In [ ]:
# score() of the third forest on the held-out rows.
X_test3 = data.loc[test_idx, feats3]
rf3.score(X_test3, data.loc[test_idx, 'tipped'])
In [23]:
# preds3 is not computed anywhere else in the notebook, so it is derived here
# from the third forest before it is used; otherwise this cell raises a
# NameError on a clean run.
preds3 = rf3.predict_proba(data.loc[test_idx, feats3])

# ROC curve and AUC of the third forest, scored on the probability of class 1.
y_test3 = data['tipped'].loc[test_idx]
fpr3, tpr3, thr3 = roc_curve(y_test3, preds3[:,1])
auc3 = roc_auc_score(y_test3, preds3[:,1])
print(auc3)
plot(fpr3,tpr3)
plot(fpr3,fpr3)  # chance-level diagonal
xlabel("False positive rate")
ylabel("True positive rate")
Out[23]:
In [25]:
# Feature importances of the third forest, most important first.
fi3 = sorted(zip(feats3, rf3.feature_importances_), key=lambda pair: -pair[1])
pandas.DataFrame(fi3, columns=["Feature","Importance"])
Out[25]:
In [26]:
# Start the fourth feature set from a *copy* of the third. A plain
# `feats4 = feats3` would alias the list, so the later in-place
# `feats4 += [...]` would also append the datetime columns to feats3.
feats4 = list(feats3)
In [27]:
# Datetime features (hour of day, day of week, week of year)
pickup = pandas.to_datetime(data['pickup_datetime'])
dropoff = pandas.to_datetime(data['dropoff_datetime'])
data['pickup_hour'] = pickup.apply(lambda e: e.hour)
data['pickup_day'] = pickup.apply(lambda e: e.dayofweek)
#data['pickup_week'] = pickup.apply(lambda e: e.week)
data['dropoff_hour'] = dropoff.apply(lambda e: e.hour)
data['dropoff_day'] = dropoff.apply(lambda e: e.dayofweek)
#data['dropoff_week'] = dropoff.apply(lambda e: e.week)
In [28]:
# Register the four datetime columns as features of the fourth set.
feats4.extend(['pickup_hour', 'pickup_day', 'dropoff_hour', 'dropoff_day'])
In [15]:
# Display the fourth feature set.
feats4
Out[15]:
In [29]:
# Random forest on the feature set that also includes the datetime columns.
rf4 = RandomForestClassifier(n_estimators=100, n_jobs=-1)
X_train4 = data.loc[train_idx, feats4]
y_train4 = data['tipped'].loc[train_idx]
rf4.fit(X_train4, y_train4)
Out[29]:
In [33]:
# Held-out class probabilities and score() for the fourth forest.
X_test4 = data.loc[test_idx, feats4]
preds4 = rf4.predict_proba(X_test4)
rf4.score(X_test4, data.loc[test_idx, 'tipped'])
Out[33]:
In [31]:
# ROC curve and AUC of the fourth forest, scored on the probability of class 1.
y_test4 = data['tipped'].loc[test_idx]
tipped_prob4 = preds4[:, 1]
fpr4, tpr4, thr4 = roc_curve(y_test4, tipped_prob4)
auc4 = roc_auc_score(y_test4, tipped_prob4)
print(auc4)

# Thick green ROC curve over a thin black chance-level diagonal.
figure(figsize=(14, 8))
plot(fpr4, tpr4, "g-", linewidth=3)
plot(fpr4, fpr4, "k-", linewidth=1)
xlabel("False positive rate")
ylabel("True positive rate")
Out[31]:
In [32]:
# Feature importances of the fourth forest, most important first.
fi4 = sorted(zip(feats4, rf4.feature_importances_), key=lambda pair: -pair[1])
pandas.DataFrame(fi4, columns=["Feature","Importance"])
Out[32]:
In [59]:
# Tipped / not-tipped counts among the rows flagged as cash payments.
# NOTE(review): rows with payment type "CSH" are filtered out earlier in this
# file, so on a top-to-bottom run the ' payment_type:CSH' column would
# presumably never be created -- confirm whether this cell is still reachable.
is_cash = data[' payment_type:CSH'] == 1
data.loc[is_cash, 'tipped'].value_counts()
Out[59]:
In [242]:
# Drop-off locations coloured by outcome: tipped trips in blue, untipped in
# red (latitude on the x-axis, longitude on the y-axis).
figure(figsize=(16, 8))
tipped_trips = data[data['tipped'] == True]
untipped_trips = data[data['tipped'] == False]
plot(tipped_trips["dropoff_latitude"], tipped_trips["dropoff_longitude"], 'b,')
plot(untipped_trips["dropoff_latitude"], untipped_trips["dropoff_longitude"], 'r,')
xlim(40.6, 40.9)
ylim(-74.05, -73.9)
Out[242]: