In [13]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import datetime as dt
In [221]:
for i in ['jan','feb','mar','apr','may','jun','jul','aug','sep','oct','nov','dec']:
df = pd.read_csv('csv/'+i+'.csv')
print i,'\n',df.groupby('line').sum().sort(columns=['count'],ascending=False)['count'][:5]
In [603]:
line={}
In [601]:
lines=set(df.line)
In [50]:
mon=['jan','feb','mar','apr','may','jun']
for i in mon:
df = pd.read_csv('csv/'+i+'.csv')
del df['Unnamed: 0']
df.hour = map(lambda x: '0'+str(x) if x<10 else str(x),df.hour)
df.day = map(lambda x: '0'+str(x) if x<10 else str(x),df.day)
df.month = map(lambda x: '0'+str(x) if x<10 else str(x),df.month)
df['time']=map(lambda x,y,z: dt.datetime.strptime('2016'+x+y+z,'%Y%m%d%H'),
df.month,df.day,df.hour)
df['date']=map(lambda x,y: dt.datetime.strptime('2016'+x+y,'%Y%m%d').date(),
df.month,df.day)
df = df.sort(columns=['time'])
print i,'\n',df.groupby('line').sum().sort(columns=['count'],ascending=False)['count'][:5]
line_of_month = df.groupby('line').sum().sort(columns=['count'],ascending=False).index[:5]
plt.figure()
for j in line_of_month:
plt.plot(df[df['line']==j]['date'],df[df['line']==j]['count'],label=j)
plt.title(i.upper())
plt.legend()
plt.figure()
for j in line_of_month:
plt.plot(df[df['line']==j]['time'],df[df['line']==j]['count'],label=j)
plt.title(i.upper())
plt.legend()
In [624]:
for i in ['jan']:
df = pd.read_csv('csv/'+i+'.csv')
del df['Unnamed: 0']
df.hour = map(lambda x: '0'+str(x) if x<10 else str(x),df.hour)
df.day = map(lambda x: '0'+str(x) if x<10 else str(x),df.day)
df.month = map(lambda x: '0'+str(x) if x<10 else str(x),df.month)
df['time']=map(lambda x,y,z: dt.datetime.strptime('2016'+x+y+z,'%Y%m%d%H'),
df.month,df.day,df.hour)
df['date']=map(lambda x,y: dt.datetime.strptime('2016'+x+y,'%Y%m%d').date(),
df.month,df.day)
df = df.sort(columns=['time'])
for j in lines:
if (j in df.line.values)== False:
continue
line[j] = df[df['line']==j]
agg_date= pd.DataFrame(df.groupby('date').sum()['count'])
agg_time= pd.DataFrame(df.groupby('time').sum()['count'])
agg_day= pd.DataFrame(df.groupby('day_of_week').sum()['count'])
In [643]:
#line_of_month = ['C','1','E','6','R']
mon=['jan','feb','mar','apr','may','jun']
for i in mon[1:]:
df = pd.read_csv('csv/'+i+'.csv')
del df['Unnamed: 0']
df.hour = map(lambda x: '0'+str(x) if x<10 else str(x),df.hour)
df.day = map(lambda x: '0'+str(x) if x<10 else str(x),df.day)
df.month = map(lambda x: '0'+str(x) if x<10 else str(x),df.month)
df['time']=map(lambda x,y,z: dt.datetime.strptime('2016'+x+y+z,'%Y%m%d%H'),
df.month,df.day,df.hour)
df['date']=map(lambda x,y: dt.datetime.strptime('2016'+x+y,'%Y%m%d').date(),
df.month,df.day)
df = df.sort(columns=['time'])
#print i,'\n',df.groupby('line').sum().sort(columns=['count'],ascending=False)['count'][:3]
for j in lines:
if (j in df.line.values)== False:
continue
if j in line.keys():
line[j] = pd.concat([line[j],df[df['line']==j]])
else:
line[j] = df[df['line']==j]
agg_date= pd.concat([agg_date,pd.DataFrame(df.groupby('date').sum()['count'])])
agg_time = pd.concat([agg_time,pd.DataFrame(df.groupby('time').sum()['count'])])
agg_day = pd.concat([agg_day,pd.DataFrame(df.groupby('day_of_week').sum()['count'])],axis=1)
In [644]:
#line_of_month = ['C','1','E','6','R']
mon=['jul','aug','sep','oct','nov','dec']
for i in mon:
df = pd.read_csv('csv/'+i+'.csv')
del df['Unnamed: 0']
df.hour = map(lambda x: '0'+str(x) if x<10 else str(x),df.hour)
df.day = map(lambda x: '0'+str(x) if x<10 else str(x),df.day)
df.month = map(lambda x: '0'+str(x) if x<10 else str(x),df.month)
df['time']=map(lambda x,y,z: dt.datetime.strptime('2015'+x+y+z,'%Y%m%d%H'),
df.month,df.day,df.hour)
df['date']=map(lambda x,y: dt.datetime.strptime('2015'+x+y,'%Y%m%d').date(),
df.month,df.day)
df = df.sort(columns=['time'])
#print i,'\n',df.groupby('line').sum().sort(columns=['count'],ascending=False)['count'][:3]
for j in lines:
if (j in df.line.values)== False:
continue
if j in line.keys():
line[j] = pd.concat([line[j],df[df['line']==j]])
else:
line[j] = df[df['line']==j]
agg_date= pd.concat([agg_date,pd.DataFrame(df.groupby('date').sum()['count'])])
agg_time = pd.concat([agg_time,pd.DataFrame(df.groupby('time').sum()['count'])])
agg_day = pd.concat([agg_day,pd.DataFrame(df.groupby('day_of_week').sum()['count'])],axis=1)
In [645]:
print line['C'].count().sum()
print line['1'].count().sum()
In [273]:
line_C.plot('time','count',label='C',figsize=(20,5))
line_1.plot('time','count',label='1',figsize=(20,5))
line_E.plot('time','count',label='E',figsize=(20,5))
line_6.plot('time','count',label='6',figsize=(20,5))
line_R.plot('time','count',label='R',figsize=(20,5))
line_F.plot('time','count',label='F',figsize=(20,5))
Out[273]:
In [275]:
line_C.plot('date','count',label='C',figsize=(20,5))
line_1.plot('date','count',label='1',figsize=(20,5))
line_E.plot('date','count',label='E',figsize=(20,5))
line_6.plot('date','count',label='6',figsize=(20,5))
line_R.plot('date','count',label='R',figsize=(20,5))
line_F.plot('date','count',label='F',figsize=(20,5))
Out[275]:
In [ ]:
In [ ]:
In [282]:
agg_time.plot(figsize=(15,5),legend=False)
plt.title('Total trips by hour',fontsize=20)
plt.ylabel('trip_counts',fontsize=15)
plt.xlabel('time',fontsize=15)
Out[282]:
In [155]:
agg_date.plot(figsize=(15,5))
plt.title('How many yellow cab trips near subway stations?',fontsize=20)
plt.ylabel('trip_counts',fontsize=15)
plt.xlabel('date',fontsize=15)
Out[155]:
In [ ]:
In [554]:
agg_date.plot(figsize=(15,5))
plt.plot([dt.date(2015,7,1),dt.date(2016,6,30)],[3471,3471])
plt.title('How many yellow cab trips near subway stations?',fontsize=20)
plt.ylabel('trip_counts',fontsize=15)
plt.xlabel('date',fontsize=15)
Out[554]:
In [369]:
agg_time.loc['2015-12-20 00:00:00':'2015-12-29 00:00:00'].plot(figsize=(15,5))
plt.title('What happened on Christamas 2015 ?',fontsize=20)
plt.ylabel('trip_counts',fontsize=15)
plt.xlabel('date',fontsize=15)
Out[369]:
In [371]:
agg_time.loc['2016-01-20 00:00:00':'2016-01-27 00:00:00'].plot(figsize=(15,5))
plt.title('What happened on Jan 23, 2016 ?',fontsize=20)
plt.ylabel('trip_counts',fontsize=15)
plt.xlabel('date',fontsize=15)
Out[371]:
In [496]:
agg_date.loc[dt.date(2015,7,1):dt.date(2015,7,31)].plot(figsize=(15,5))
plt.title('Which days got the peak on July 2015 ?',fontsize=20)
plt.ylabel('trip_counts',fontsize=15)
plt.xlabel('date',fontsize=15)
Out[496]:
In [412]:
agg_time.loc['2015-07-18 00:00:00':'2015-07-21 00:00:00'].plot(figsize=(15,5))
plt.title('At what time people using yellow cab?',fontsize=20)
plt.ylabel('trip_counts',fontsize=15)
plt.xlabel('date',fontsize=15)
Out[412]:
In [430]:
date_without_outlier= agg_date[(agg_date.index!=dt.date(2016,1,23))&(agg_date.index!=dt.date(2016,1,24))&(agg_date.index!=dt.date(2015,12,25))]
In [438]:
import numpy as np
In [469]:
a=date_without_outlier.sort(columns=['count'])
b=np.random.normal(loc=a.mean(),scale=a.std(),size=len(a))
b=sorted(b)
plt.plot(a,b)
plt.plot([1500,4000],[1500,4000])
Out[469]:
In [499]:
date_without_outlier.hist(bins=30)
plt.title('Histogram of trip counts from July 2015 to June 2016',fontsize=18)
Out[499]:
In [477]:
import scipy.stats as stats
In [502]:
stats.normaltest(date_without_outlier)
Out[502]:
In [573]:
#sns.boxplot(agg_date['count'])
plt.figure(figsize=(15,5))
r = sns.boxplot(agg_date['count'],fliersize=1)
sns.swarmplot(agg_date['count'],color='indianred')
plt.title("Which are outliers?",fontsize=20)
Out[573]:
In [647]:
agg_date.plot(figsize=(15,5))
plt.plot([dt.date(2015,7,1),dt.date(2016,6,30)],[3471,3471],label='upper bound')
plt.plot([dt.date(2015,7,1),dt.date(2016,6,30)],[1581,1581],label='lower bound')
plt.title('Which days are outliers?',fontsize=20)
plt.ylabel('trip_counts',fontsize=15)
plt.xlabel('date',fontsize=15)
plt.legend()
Out[647]:
In [593]:
top_points = r["fliers"][0].get_data()[1]
bottom_points = r["fliers"][2].get_data()[1]
plt.plot(np.ones(len(top_points)), top_points, "+")
plt.plot(np.ones(len(bottom_points)), bottom_points, "+")
In [590]:
r=agg_date.boxplot()
In [592]:
r['fliers'][0].get_data()[1]
Out[592]:
In [ ]:
In [597]:
line_C.plot('time','count',label='C',figsize=(15,5))
plt.title('Why Jan 10 11AM, unusual time for C line?',fontsize=20)
plt.ylabel('trip_counts',fontsize=15)
plt.xlabel('date',fontsize=15)
Out[597]:
In [596]:
line_C.plot('time','count',label='C',figsize=(15,5))
plt.title('Which day got the peak?',fontsize=20)
plt.ylabel('trip_counts',fontsize=15)
plt.xlabel('time',fontsize=15)
plt.xlim('2016-01-10','2016-01-11')
Out[596]:
In [764]:
line_C.groupby('date').sum()['count'].plot(figsize=(15,5),label='C')
plt.legend()
plt.title('line C seems to be quite normal ,except for Jan 23', fontsize =20)
Out[764]:
In [766]:
line_1.groupby('date').sum()['count'].plot(figsize=(15,5),label='1')
plt.legend()
plt.title('line 1 got a peak on Feb 20,2016 ', fontsize =20)
Out[766]:
In [768]:
line_1.groupby('date').sum()['count'].iloc[220:250].plot(figsize=(15,5),label='1')
plt.title('Zoom in anomaly day in line 1', fontsize= 20)
plt.legend()
Out[768]:
In [331]:
line_1.plot('time','count',label='1',figsize=(15,5))
plt.title('Which day got the peak?',fontsize=20)
plt.ylabel('trip_counts',fontsize=15)
plt.xlabel('date',fontsize=15)
Out[331]:
In [475]:
agg_time.hist(bins=50) ## so it is not normal distribution
Out[475]:
In [668]:
line_date={}
for i in line.keys():
line_date[i] = line[i].groupby('date').sum()['count']
In [761]:
line_date['1'].hist(bins=30,label='1',alpha=0.5)
line_date['C'].hist(bins=30,label='C',alpha=0.5)
plt.legend()
plt.title('Is difference of line 1 and line C statistically significant?',fontsize=15)
Out[761]:
In [737]:
from scipy.stats import ttest_ind
In [739]:
ttest_ind(line_date['1'],line_date['C'])
Out[739]:
In [755]:
ttest_ind(line_date['1'],line_date['R'])
Out[755]:
In [756]:
ttest_ind(line_date['C'],line_date['R'])
Out[756]:
In [669]:
for i in line_date.keys():
line_date[i].plot()
In [179]:
from scipy.stats import pearsonr
In [709]:
line_date['3'].plot(figsize=(15,5),label='3')
line_date['2'].plot(figsize=(15,5),label='2')
plt.legend()
plt.title('line 3 and line 2 are highly alike')
Out[709]:
In [710]:
line_date['J'].plot(figsize=(15,5),label='J')
line_date['Z'].plot(figsize=(15,5),label='Z')
plt.legend()
plt.title('line J and line Z are highly alike')
Out[710]:
In [717]:
### pvalue <0.5, coefficient >0.9, highly correlated
for i in range(len(line_date.keys())-1):
for j in range(i+1,len(line_date.keys())):
if line_date[line_date.keys()[i]].shape == line_date[line_date.keys()[j]].shape:
if pearsonr(line_date[line_date.keys()[i]],line_date[line_date.keys()[j]])[1]<0.05:
if pearsonr(line_date[line_date.keys()[i]],line_date[line_date.keys()[j]])[0]>0.9:
print line_date.keys()[i],line_date.keys()[j]
print pearsonr(line_date[line_date.keys()[i]],line_date[line_date.keys()[j]])
In [711]:
### pvalue <0.5, coefficient <0.2, they are correlated but loose relationship
for i in range(len(line_date.keys())-1):
for j in range(i+1,len(line_date.keys())):
if line_date[line_date.keys()[i]].shape == line_date[line_date.keys()[j]].shape:
if pearsonr(line_date[line_date.keys()[i]],line_date[line_date.keys()[j]])[1]<0.05:
if pearsonr(line_date[line_date.keys()[i]],line_date[line_date.keys()[j]])[0]<0.2:
print line_date.keys()[i],line_date.keys()[j]
print pearsonr(line_date[line_date.keys()[i]],line_date[line_date.keys()[j]])
In [718]:
### pvalue >0.5 , they are not correlated
for i in range(len(line_date.keys())-1):
for j in range(i+1,len(line_date.keys())):
if line_date[line_date.keys()[i]].shape == line_date[line_date.keys()[j]].shape:
if pearsonr(line_date[line_date.keys()[i]],line_date[line_date.keys()[j]])[1]>0.05:
#if pearsonr(line_date[line_date.keys()[i]],line_date[line_date.keys()[j]])[0]<0.2:
print line_date.keys()[i],line_date.keys()[j]
print pearsonr(line_date[line_date.keys()[i]],line_date[line_date.keys()[j]])
In [ ]: