In [32]:
import pandas as pd

In [33]:
# Small demo frame: one row per person, columns in the order name, balance, gender.
df = pd.DataFrame(
    [
        ('Alice', 100.0, 'Female'),
        ('Bob', 200.0, 'Male'),
        ('Charlie', 300.0, 'Male'),
        ('Danielle', 400.0, 'Female'),
    ],
    columns=['name', 'balance', 'gender'],
)

In [34]:
# Inspect the dtype pandas inferred for each column of the freshly built frame.
df.dtypes


Out[34]:
name        object
balance    float64
gender      object
dtype: object

In [35]:
# Re-encode the gender column as a pandas categorical, overwriting it in place.
df['gender'] = df.gender.astype('category')

In [36]:
# Display the frame's contents.
df


Out[36]:
name balance gender
0 Alice 100 Female
1 Bob 200 Male
2 Charlie 300 Male
3 Danielle 400 Female

In [37]:
# Re-check the column dtypes now that gender has been cast with astype('category').
df.dtypes


Out[37]:
name         object
balance     float64
gender     category
dtype: object

In [38]:
# Show the gender column on its own; the Series repr also lists its categories.
df['gender']


Out[38]:
0    Female
1      Male
2      Male
3    Female
Name: gender, dtype: category
Categories (2, object): [Female, Male]

In [39]:
# The distinct labels that back the categorical gender column.
df['gender'].cat.categories


Out[39]:
Index([u'Female', u'Male'], dtype='object')

In [40]:
# Integer code stored for each row; a code is the position of the row's label
# within .cat.categories.
df['gender'].cat.codes


Out[40]:
0    0
1    1
2    1
3    0
dtype: int8


In [41]:
# Load the trip records from CSV; this rebinds `df`, replacing the toy frame used so far.
df = pd.read_csv(filepath_or_buffer='trip_data_1.csv')

In [42]:
# Preview the first five rows of the loaded trip data.
df.head(n=5)


Out[42]:
medallion hack_license vendor_id rate_code store_and_fwd_flag pickup_datetime dropoff_datetime passenger_count trip_time_in_secs trip_distance pickup_longitude pickup_latitude dropoff_longitude dropoff_latitude
0 89D227B655E5C82AECF13C3F540D4CF4 BA96DE419E711691B9445D6A6307C170 CMT 1 N 2013-01-01 15:11:48 2013-01-01 15:18:10 4 382 1.0 -73.978165 40.757977 -73.989838 40.751171
1 0BD7C8F5BA12B88E0B67BED28BEA73D8 9FD8F69F0804BDB5549F40E9DA1BE472 CMT 1 N 2013-01-06 00:18:35 2013-01-06 00:22:54 1 259 1.5 -74.006683 40.731781 -73.994499 40.750660
2 0BD7C8F5BA12B88E0B67BED28BEA73D8 9FD8F69F0804BDB5549F40E9DA1BE472 CMT 1 N 2013-01-05 18:49:41 2013-01-05 18:54:23 1 282 1.1 -74.004707 40.737770 -74.009834 40.726002
3 DFD2202EE08F7A8DC9A57B02ACB81FE2 51EE87E3205C985EF8431D850C786310 CMT 1 N 2013-01-07 23:54:15 2013-01-07 23:58:20 2 244 0.7 -73.974602 40.759945 -73.984734 40.759388
4 DFD2202EE08F7A8DC9A57B02ACB81FE2 51EE87E3205C985EF8431D850C786310 CMT 1 N 2013-01-07 23:25:03 2013-01-07 23:34:24 1 560 2.1 -73.976250 40.748528 -74.002586 40.747868

In [43]:
%time
df.groupby(df.medallion).trip_distance.sum().sort(ascending=False, inplace=False).head()


CPU times: user 8 µs, sys: 2 µs, total: 10 µs
Wall time: 59.1 µs
Out[43]:
medallion
6945300E90C69061B463CCDA370DE5D6    5391.43
6BD1B641A1CD55803A21560299B985A7    5359.30
8211BE04462ADE4621B68E1DFEA54754    5342.80
0076C8327A95E988E721AC33B0FA9D67    5276.70
6B19837BFEAA374B7B4694799D7CC746    5149.70
Name: trip_distance, dtype: float64

In [44]:
%time df['medallion'] = df['medallion'].astype('category')


CPU times: user 2.64 s, sys: 1.7 s, total: 4.34 s
Wall time: 5.05 s

In [45]:
%time
df.groupby(df.medallion).trip_distance.sum().sort(ascending=False, inplace=False).head()


CPU times: user 4 µs, sys: 1 µs, total: 5 µs
Wall time: 10 µs
Out[45]:
medallion
6945300E90C69061B463CCDA370DE5D6    5391.43
6BD1B641A1CD55803A21560299B985A7    5359.30
8211BE04462ADE4621B68E1DFEA54754    5342.80
0076C8327A95E988E721AC33B0FA9D67    5276.70
6B19837BFEAA374B7B4694799D7CC746    5149.70
Name: trip_distance, dtype: float64

In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]: