Manipulating data


In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
# Trip records: both timestamp columns are parsed into datetimes at load time.
data = pd.read_csv(
    'data/nyc_data.csv',
    parse_dates=['pickup_datetime', 'dropoff_datetime'],
)
# Fare records: only the pickup timestamp column is parsed.
fare = pd.read_csv(
    'data/nyc_fare.csv',
    parse_dates=['pickup_datetime'],
)

Selecting data


In [2]:
# Select two columns by passing a list of names, then preview the first 3 rows.
data[['trip_distance', 'trip_time_in_secs']].head(3)


Out[2]:
   trip_distance  trip_time_in_secs
0           0.61                300
1           3.28                960
2           1.50                386

In [3]:
# .loc with a single index label returns that row as a Series
# (one entry per column, as the output shows).
data.loc[0]


Out[3]:
medallion             76942C3205E17D7E7FE5A9F709D16434
hack_license          25BA06A87905667AA1FE5990E33F0E2E
vendor_id                                          VTS
rate_code                                            1
store_and_fwd_flag                                 NaN
pickup_datetime                    2013-01-01 00:00:00
dropoff_datetime                   2013-01-01 00:05:00
passenger_count                                      3
trip_time_in_secs                                  300
trip_distance                                     0.61
pickup_longitude                             -73.95592
pickup_latitude                               40.78189
dropoff_longitude                            -73.96318
dropoff_latitude                              40.77783
Name: 0, dtype: object

In [4]:
# .loc with a list of index labels selects those rows together.
data.loc[[0, 100000]]

In [5]:
# Label-based slice with a step: every 10th row between labels 1000 and 2000,
# restricted to two columns. Unlike a plain Python slice, the end label (2000)
# is included, as the output shows.
data.loc[1000:2000:10,
         ['trip_distance', 'trip_time_in_secs']]


Out[5]:
      trip_distance  trip_time_in_secs
1000           1.00                441
1010           3.80                691
....
1990           0.13                 60
2000           9.60                963

In [6]:
# Boolean mask: keep only the rows whose trip_distance exceeds 50.
data.loc[data.trip_distance>50]

In [7]:
from ipywidgets import interact

In [8]:
@interact
def show_nrows(distance_threshold=(0, 200)):
    """Return the number of trips strictly longer than ``distance_threshold``.

    The ``(0, 200)`` default is not used as a value: ``@interact`` reads it
    as a (min, max) range and renders an integer slider, calling this
    function with the slider's current value every time it moves.
    """
    # Sum the boolean mask instead of building a filtered copy of the whole
    # frame just to take its length; this runs on every slider event.
    return int((data.trip_distance > distance_threshold).sum())

Computing with numbers


In [9]:
# Derive the trip duration in minutes from the duration in seconds
# (vectorized: the division is applied to the whole column at once).
data['trip_time_in_mins'] = data['trip_time_in_secs'] / 60.0

In [10]:
# Show the original seconds column next to the derived minutes column.
data[['trip_time_in_secs', 'trip_time_in_mins']].head(3)


Out[10]:
   trip_time_in_secs  trip_time_in_mins
0                300           5.000000
1                960          16.000000
2                386           6.433333

In [11]:
# First five trip distances (positions 0 through 4), selected explicitly by
# position.
a = data.trip_distance.iloc[:5]
a


Out[11]:
0    0.61
1    3.28
2    1.50
3    0.00
4    1.31
Name: trip_distance, dtype: float64

In [12]:
# Trip distances at positions 2 through 5, selected explicitly by position.
b = data.trip_distance.iloc[2:6]
b


Out[12]:
2    1.50
3    0.00
4    1.31
5    5.81
Name: trip_distance, dtype: float64

In [13]:
# Addition aligns the two Series on their index labels; labels present in
# only one operand (0, 1 and 5 here) yield NaN.
a + b


Out[13]:
0     NaN
1     NaN
2    3.00
3    0.00
4    2.62
5     NaN
Name: trip_distance, dtype: float64

Working with text


In [14]:
# Preview the medallion column, which holds text values.
data.medallion.head(3)


Out[14]:
0    76942C3205E17D7E7FE5A9F709D16434
1    517C6B330DBB3F055D007B07512628B3
2    ED15611F168E41B33619C83D900FE266
Name: medallion, dtype: object

In [15]:
# Keep the first four characters of each medallion via the vectorized
# string accessor.
data.medallion.str[:4].head(3)


Out[15]:
0    7694
1    517C
2    ED15
Name: medallion, dtype: object

Working with dates and times


In [16]:
# Day of the week of each pickup (Monday=0 ... Sunday=6), sampled at every
# 200,000th row.
data.pickup_datetime.dt.dayofweek.iloc[::200000]


Out[16]:
0         1
200000    6
400000    5
600000    0
800000    1
dtype: int64

In [17]:
# Find trips whose pickup and dropoff fall on different calendar days.
# Compare full dates (timestamps truncated to midnight) rather than the
# day-of-month number: `.dt.day` alone would treat e.g. Jan 15 and Feb 15
# as the same day and silently miss such rows.
day_p = data.pickup_datetime.dt.normalize()
day_d = data.dropoff_datetime.dt.normalize()
selection = (day_p != day_d)
# A boolean mask sums to the number of True entries, so there is no need to
# materialize the filtered frame just to count it.
print(selection.sum())
data.loc[selection].head(3)


Out[17]:
7716

Handling missing data