Scroll down to the analysis section to see the actual analysis.
In [2]:
# Scratch expression with no effect on the analysis below
# (presumably a quick check that the kernel responds -- TODO confirm, or delete the cell).
1+1
Out[2]:
In [2]:
import pandas as pd
import numpy as np
import glob
import os
In [3]:
# Directory that holds the 2013 taxi trip archives. The trailing slash is kept
# because the glob pattern is later built by plain string concatenation.
trip_data_dir = "~/Documents/TaxiTripData/2013taxi_trip_data/"
path = os.path.expanduser(trip_data_dir)
In [4]:
# Collect every compressed trip file in the data directory.
archive_pattern = path + '*.csv.zip.gz'
files = glob.glob(archive_pattern)
In [5]:
# Build per-date pickup counts for every trip file, reading each file in
# fixed-size chunks so that no file has to fit in memory at once.
#
# countlist ends up holding one Series per processed chunk (index: calendar
# date, value: number of pickups on that date within the chunk); the chunks
# are combined in a later cell.
countlist = []
cunk = 100000  # number of rows read per chunk
for i in files:
    print(i)
    # Only the column at position 5 is read; it is exposed as 'pickup_datetime'.
    iterate = pd.read_csv(
        i,
        usecols=[5],
        iterator=True,
        chunksize=cunk,
        names=['pickup_datetime'],
        header=0,
    )
    counter = 0
    for chunk in iterate:
        # errors='coerce' turns unparseable values into NaT instead of raising,
        # so rows with non-date content are discarded by dropna(). This replaces
        # the former row-by-row check, which relied on the removed
        # pd.tslib.Timestamp and dropped rows by position although chunk labels
        # continue counting across chunks.
        a = pd.to_datetime(chunk['pickup_datetime'], errors='coerce').dropna()
        if a.empty:
            # Nothing in this chunk could be parsed as a date.
            print('chunk ' + str(counter)+' of file '+i+' encountered a problem and was not processed.')
        else:
            countlist.append(a.dt.date.value_counts())
        counter += 1
In [6]:
# Stack the per-chunk counts, then add up the entries that share a date so
# that each date appears exactly once.
stacked_counts = pd.concat(countlist)
cond = stacked_counts.groupby(stacked_counts.index).sum()
Out[6]:
In [7]:
# Convert the index labels to pandas datetimes so that the .month / .year /
# .weekday accessors used further down are available.
parsed_index = pd.to_datetime(cond.index)
cond.index = parsed_index
In [ ]:
# IPython magic: show the current working directory. The pickle written in the
# next cell uses a relative file name, so this is where it will be created.
%pwd
In [8]:
# Persist the aggregated daily counts so the chunked read above does not have
# to be repeated in a fresh kernel.
counts_cache = '2013_taxi_date_counts.pkl'
cond.to_pickle(counts_cache)
In [3]:
# Reload the cached daily counts written by the to_pickle cell.
# NOTE: pickle files can execute code on load; only read files you created.
counts_cache = '2013_taxi_date_counts.pkl'
cond = pd.read_pickle(counts_cache)
Something seems to be up with the month of September in this data: it has almost double the counts of any other month. It also had some non-date data in the middle of the file. I suspect there may just be repeated records in the middle of the file, so I simply excluded September from this analysis.
In [4]:
# Exclude every September row (month == 9) from the daily counts.
# A boolean mask is used instead of assigning NaN and calling
# dropna(inplace=True): writing NaN into the Series would upcast integer
# counts to float, and the in-place drop would mutate the loaded data.
is_september = cond.index.month == 9
cond = cond[~is_september]
In [12]:
# Aggregate the daily pickup counts in `cond` into one total per (year, month).
months = cond.to_frame(name='Pickups')
months['month'] = months.index.month
months['year'] = months.index.year
months = months.groupby(['year', 'month']).sum()

# Add an explicit zero row for September 2013 so the month still shows up as
# an (empty) slot in the bar chart rather than vanishing from the x axis.
september_gap = pd.DataFrame(
    {'Pickups': [0]},
    index=pd.MultiIndex.from_tuples([(2013, 9)], names=['year', 'month']),
)

# pd.concat / sort_index replace DataFrame.append / DataFrame.sort, both of
# which have been removed from pandas.
months = pd.concat([months, september_gap]).sort_index()
In [13]:
# Bar chart of total pickups per month, saved as a PDF next to the notebook.
# No explicit x-axis label is set for this figure.
monthly_bar_options = dict(
    kind='bar',
    title='Taxis trips fluctuate',
    color='gray',
    legend=False,
)
plot = months.plot(**monthly_bar_options)
plot.set_ylabel('Trips')
monthly_figure = plot.get_figure()
monthly_figure.savefig('yellowcabyear.pdf')
This fluctuates much more than I would expect, but I don't know if there is really much else to say about it.
In [6]:
# Total pickups per day of the week (weekday 0 = Monday ... 6 = Sunday),
# drawn as a bar chart and saved as a PDF.
weekday_labels = ['Monday', 'Tuesday', 'Wednesday', 'Thursday',
                  'Friday', 'Saturday', 'Sunday']

# Group the daily counts directly by weekday number; `cond` itself is untouched.
dow = cond.groupby(cond.index.weekday).sum()
# The grouped result is ordered by weekday number, so the names line up by position.
dow.index = weekday_labels

weekday_bar_options = dict(
    kind='bar',
    title='Taxis are for parties too',
    color='gray',
    legend=False,
)
plot = dow.plot(**weekday_bar_options)
plot.set_xlabel('Days of the week')
plot.set_ylabel('Trips')
weekday_figure = plot.get_figure()
weekday_figure.savefig('yellowcabweek.pdf')
We see the same thing that we saw in the Uber data; taxis are popular for transportation to weekend entertainment, more so than for commuting.
In [ ]: