Complex operations



In [1]:

    
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn
%matplotlib inline
# Load the ride table, parsing both timestamp columns as datetimes.
data = pd.read_csv(
    'data/nyc_data.csv',
    parse_dates=['pickup_datetime', 'dropoff_datetime'],
)
# Load the fare table; only its pickup timestamp column is parsed as datetime.
fare = pd.read_csv(
    'data/nyc_fare.csv',
    parse_dates=['pickup_datetime'],
)

Group-by



In [2]:

    
# Group the rides by the ISO week number of their pickup time.
# `Series.dt.weekofyear` was deprecated in pandas 1.1 and removed in 2.0;
# `dt.isocalendar().week` is the documented replacement and returns the same
# ISO week numbers (the group key is now named 'week').
# NOTE: ISO weeks can assign the last days of December to week 1 of the
# following year, so group 1 may mix early-January and late-December rides.
weekly = data.groupby(data.pickup_datetime.dt.isocalendar().week)



In [3]:

    
# Number of groups produced by the weekly group-by.
len(weekly)









    Out[3]:





52



In [4]:

    
# Size of each weekly group, i.e. the number of rows (rides) per week key.
y = weekly.size()
y.head(3)  # show the first three entries









    Out[4]:





1    17042
2    15941
3    17017
dtype: int64



In [5]:

    
# Take the first pickup timestamp found in each weekly group; it serves as a
# datetime label for that week when the counts are plotted.
x = weekly['pickup_datetime'].first()
x.head(3)









    Out[5]:





1   2013-01-01 00:00:00
2   2013-01-07 00:03:00
3   2013-01-14 00:00:51
Name: pickup_datetime, dtype: datetime64[ns]



In [6]:

    
# Plot the weekly ride counts against the first pickup timestamp of each week.
rides_per_week = pd.Series(y.values, index=x)
ax = rides_per_week.plot()
ax.set_ylim(0)  # Anchor the lower y limit at 0.
ax.set_xlabel('Week')  # Label of the x axis.
ax.set_ylabel('Taxi rides')  # Label of the y axis.

Joins



In [7]:

    
# Average tip per medallion, computed only over fares whose tip is above zero.
has_tip = fare.tip_amount > 0
tip = (
    fare.loc[has_tip, ['medallion', 'tip_amount']]
    .groupby('medallion')
    .mean()
)
print(len(tip))
tip.head(3)









    Out[7]:





13407
                                  tip_amount
medallion
00005007A9F30E289E760362F69E4EAD    1.815854
000318C2E3E6381580E5C99910A60668    2.857222
000351EDC735C079246435340A54C7C1    2.099111



In [8]:

    
# Histogram of the per-medallion average tip.
# 100 evenly spaced bin edges from 0 to 6 inclusive, i.e. 99 bins.
bin_edges = np.linspace(0., 6., 100)
tip.hist(bins=bin_edges)
plt.xlabel('Average tip')
plt.ylabel('Number of taxis')



In [9]:

    
# Left-join the per-medallion average tip onto every ride: rides are matched
# through their 'medallion' column against the index of `tip`, and rides with
# no matching medallion get NaN in the joined columns.
data_merged = data.merge(
    tip,
    how='left',
    left_on='medallion',
    right_index=True,
)
data_merged.head(3)