In [1]:
import pandas as pd
import datetime
df = pd.read_csv('201707-citibike-tripdata.csv')
df.columns = ['tripduration', 'starttime', 'stoptime',
              'start_station_id', 'start_station_name',
              'start_station_latitude', 'start_station_longitude',
              'end_station_id', 'end_station_name',
              'end_station_latitude', 'end_station_longitude',
              'bikeid', 'usertype', 'birth_year', 'gender']
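Since starttime and stoptime get re-parsed with strptime further down, a small alternative sketch: pandas can parse those two columns (positions 1 and 2 in the raw file) at load time, which would make the later conversion unnecessary.
# Sketch: parse the two timestamp columns by position while reading
df = pd.read_csv('201707-citibike-tripdata.csv', parse_dates=[1, 2])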
In [2]:
from sqlalchemy import create_engine
engine = create_engine('mysql://calee0219:110010@localhost/citybike')
In [3]:
# Inspect missing values, then fill the missing birth years with the column mean
print(df.isnull().sum().sum())
print(pd.isnull(df).sum() > 0)
birth_mean = df['birth_year'].mean()
df['birth_year'] = df['birth_year'].fillna(birth_mean)
In [4]:
# Drop malformed trips whose start time is not strictly before the stop time
df = df.drop(df.index[df['starttime'] >= df['stoptime']])
df = df.reset_index(drop=True)
In [5]:
import datetime
import operator
from pyproj import Geod

wgs84_geod = Geod(ellps='WGS84')
start = [datetime.datetime.strptime(dt, '%Y-%m-%d %H:%M:%S') for dt in df['starttime'].tolist()]
end = [datetime.datetime.strptime(dt, '%Y-%m-%d %H:%M:%S') for dt in df['stoptime'].tolist()]

def Distance(lat1, lon1, lat2, lon2):
    # Geodesic distance in meters on the WGS84 ellipsoid (inv takes lon before lat)
    az12, az21, dist = wgs84_geod.inv(lon1, lat1, lon2, lat2)
    return dist

dist = Distance(df['start_station_latitude'].tolist(), df['start_station_longitude'].tolist(),
                df['end_station_latitude'].tolist(), df['end_station_longitude'].tolist())
# Average speed in km/h; total_seconds() avoids truncating trips longer than a day
speed = list(map(operator.truediv,
                 [x / 1000 for x in dist],
                 [delta.total_seconds() / 3600 for delta in map(operator.sub, end, start)]))
zp = list(zip(speed, range(df.shape[0])))
zp.sort()
zp.reverse()
for i in zp[:6]:
    print(i)
# Drop the physically implausible speed outliers identified above
df = df.drop(df.index[[716622, 320615, 1393557, 1260345]])
df.reset_index(drop=True, inplace=True)
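As a quick check of the units here: Geod.inv takes longitude before latitude and returns two azimuths plus the geodesic distance in meters. With a pair of made-up Manhattan coordinates:
# Illustrative only: two made-up points roughly 2.4 km apart
az12, az21, d = wgs84_geod.inv(-73.99, 40.73, -73.98, 40.75)
print(d / 1000)  # distance in km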
In [6]:
from sqlalchemy import types
try:
    df = pd.read_sql_table(table_name='origin', con=engine)
except Exception:
    # Table is not there yet: normalize dtypes, then write it out.
    # Note astype returns a new frame, so the result must be assigned back.
    df = df.astype({'tripduration': int,
                    'start_station_id': int, 'start_station_name': str,
                    'start_station_latitude': float, 'start_station_longitude': float,
                    'end_station_id': int, 'end_station_name': str,
                    'end_station_latitude': float, 'end_station_longitude': float,
                    'bikeid': int, 'usertype': str,
                    'birth_year': int, 'gender': int})
    df['starttime'] = pd.to_datetime(df['starttime'])
    df['stoptime'] = pd.to_datetime(df['stoptime'])
    df.to_sql(name='origin', con=engine, if_exists='replace', index=False,
              dtype={'starttime': types.DATETIME, 'stoptime': types.DATETIME,
                     'birth_year': types.BIGINT})
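An optional sanity check, assuming the write above just succeeded: read the row count back from MySQL.
# Optional: confirm the origin table landed in MySQL
print(pd.read_sql_query('SELECT COUNT(*) FROM origin', engine))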
In [7]:
try:
    station = pd.read_sql_table(table_name='station', con=engine)
except Exception:
    # Build a station lookup table from both trip endpoints
    station = pd.DataFrame(df[['start_station_id', 'start_station_name',
                               'start_station_latitude', 'start_station_longitude']])
    station.columns = ['id', 'name', 'latitude', 'longitude']
    tmp = pd.DataFrame(df[['end_station_id', 'end_station_name',
                           'end_station_latitude', 'end_station_longitude']])
    tmp.columns = ['id', 'name', 'latitude', 'longitude']
    station = pd.concat([station, tmp])
    station = station.sort_values('id').drop_duplicates().reset_index(drop=True)
    station.to_sql(name='station', con=engine, if_exists='fail', index=False)
In [8]:
from mpl_toolkits.basemap import Basemap
import matplotlib.pyplot as plt
import numpy as np
my_map = Basemap(projection='merc', lat_0=40.7, lon_0=-73.98,
                 resolution='h', area_thresh=0.01,
                 llcrnrlon=-74.1, llcrnrlat=40.64,
                 urcrnrlon=-73.9, urcrnrlat=40.85)
lon = station['longitude'].tolist()
lat = station['latitude'].tolist()
labels = station['id'].tolist()
fig = plt.figure(frameon=False)
fig.set_size_inches(18,12)
my_map.drawcoastlines()
my_map.drawcountries()
my_map.fillcontinents(color='coral')
my_map.drawmapboundary()
x,y = my_map(lon, lat)
my_map.plot(x, y, 'bo', markersize=2)
plt.show()
In [9]:
from sqlalchemy import types
try:
    path = pd.read_sql_table(table_name='path', con=engine)
except Exception:
    # Keep only the station ids and times needed for the in/out analysis
    path = df.drop(['start_station_name', 'start_station_latitude', 'start_station_longitude',
                    'end_station_name', 'end_station_latitude', 'end_station_longitude'], axis=1)
    path.to_csv('path.csv', index=False)
    path.to_sql(name='path', con=engine, if_exists='fail', index=False,
                dtype={'starttime': types.DATETIME, 'stoptime': types.DATETIME,
                       'birth_year': types.BIGINT})
In [10]:
import bisect
import datetime
try:
    in_out = pd.read_sql_table(table_name='in_out', con=engine)
except Exception:
    # 1536 half-hour boundaries running from July 1 00:00 up to Aug 1 23:30
    begin = datetime.datetime(2017, 7, 1, 0, 0, 0)
    end = datetime.datetime(2017, 8, 1, 23, 30, 0)
    date_list = [end - datetime.timedelta(seconds=x * 60 * 30) for x in range(0, 1536)][::-1]
    table = {}
    for idx, row in path.iterrows():
        # Map each trip's start and stop onto the half-hour grid
        start_date = row['starttime']
        start = date_list[bisect.bisect_right(date_list, start_date)]
        end_date = row['stoptime']
        end = date_list[bisect.bisect_right(date_list, end_date)]
        # First counter: trips leaving from this station in this time slot
        start_tmp = (row['start_station_id'], start)
        if table.get(start_tmp) is None:
            table[start_tmp] = (1, 0)
        else:
            tmp = list(table[start_tmp])
            tmp[0] += 1
            table[start_tmp] = tuple(tmp)
        # Second counter: trips arriving at this station, keyed by the stop slot
        stop_tmp = (row['end_station_id'], end)
        if table.get(stop_tmp) is None:
            table[stop_tmp] = (0, 1)
        else:
            tmp = list(table[stop_tmp])
            tmp[1] += 1
            table[stop_tmp] = tuple(tmp)
    tmp_in_out = []
    for key in table.keys():
        tmp_in_out.append([key[0], key[1], table[key][0], table[key][1]])
    in_out = pd.DataFrame(tmp_in_out, columns=['id', 'time', 'in', 'out'])
    in_out.to_sql(name='in_out', con=engine, if_exists='replace', index=False,
                  dtype={'time': types.DATETIME})
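To see what bisect_right is doing here: it returns the index of the first boundary strictly greater than the timestamp, so every event is assigned to the upper edge of its 30-minute slot. A toy grid makes this concrete:
# Toy grid with three boundaries; 00:10 falls into the slot ending 00:30
grid = [datetime.datetime(2017, 7, 1, 0, 0),
        datetime.datetime(2017, 7, 1, 0, 30),
        datetime.datetime(2017, 7, 1, 1, 0)]
t = datetime.datetime(2017, 7, 1, 0, 10)
print(grid[bisect.bisect_right(grid, t)])  # 2017-07-01 00:30:00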
In [11]:
import pandas as pd
from mlxtend.preprocessing import OnehotTransactions
from mlxtend.frequent_patterns import apriori

# Build transactions for station 519 only, bucketing the in/out counts by tens
transactions = []
for idx, row in in_out.iterrows():
    if row['id'] == 519:
        transactions.append([('in', row['in'] // 10), ('out', row['out'] // 10)])

min_sup = 0.01
oht = OnehotTransactions()
oht_ary = oht.fit(transactions).transform(transactions)
# Use a separate name so the original trip DataFrame `df` is not clobbered
oht_df = pd.DataFrame(oht_ary, columns=oht.columns_)
frequent_itemsets = apriori(oht_df, min_support=min_sup, use_colnames=True)
frequent_itemsets['length'] = frequent_itemsets['itemsets'].apply(len)
fqs = frequent_itemsets[(frequent_itemsets['length'] >= 2) &
                        (frequent_itemsets['support'] >= min_sup)].sort_values(['support'], ascending=False)
print(fqs)
for idx, row in fqs.iterrows():
    cof = list(row['itemsets'])[0]  # first item of each frequent itemset
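OnehotTransactions was later renamed in mlxtend; if the import above fails on a newer release, TransactionEncoder is the drop-in replacement with the same fit/transform/columns_ interface.
# Newer mlxtend name for the same one-hot encoder
from mlxtend.preprocessing import TransactionEncoder
oht = TransactionEncoder()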
In [12]:
import pyfpgrowth

# Same station-519 transactions, mined with FP-growth this time
transactions = []
for idx, row in in_out.iterrows():
    if row['id'] == 519:
        transactions.append([('in', row['in'] // 10), ('out', row['out'] // 10)])

# Frequent patterns with absolute support >= 10, then rules with confidence >= 0.3
patterns = pyfpgrowth.find_frequent_patterns(transactions, 10)
print(patterns)
rules = pyfpgrowth.generate_association_rules(patterns, 0.3)
for key in rules.keys():
    print(key, rules[key])
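For a feel of the pyfpgrowth API used above, a toy run (illustrative data, threshold of 2 occurrences):
# patterns maps itemset tuples to counts; rules maps antecedents to (consequent, confidence)
toy = [['a', 'b'], ['a', 'b'], ['a', 'c']]
toy_patterns = pyfpgrowth.find_frequent_patterns(toy, 2)
print(toy_patterns)
print(pyfpgrowth.generate_association_rules(toy_patterns, 0.5))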
In [23]:
query = "SELECT in_out.id, in_out.time, in_out.in, in_out.out, T.latitude, T.longitude FROM in_out left join ( SELECT id, latitude, longitude from station )T ON T.id = in_out.id ORDER BY id"
table = pd.read_sql_query(query, engine)
lat_mean = station['latitude'].mean()
lon_mean = station['longitude'].mean()
#print(lat_mean, lon_mean)
def Distance(lat1,lon1,lat2,lon2):
az12,az21,dist = wgs84_geod.inv(lon1,lat1,lon2,lat2)
return dist
from orangecontrib.associate.fpgrowth import *
rem = {}
for idx, row in station.iterrows():
rem[row['id']] = Distance(lat_mean, lon_mean, row['latitude'], row['longitude'])//1000 # 以公里為單位
from fp_growth import *
transactions = []
for idx, row in table.iterrows():
rin = row['in'] // 10
rout = row['out'] // 10
if rin == 0 or rout == 0: continue
transactions.append([(rem[row['id']], row['time'].time().isoformat()), ('in',rin), ('out',rout)])
result = {}
for itemset, support in find_frequent_itemsets(transactions, .02*len(transactions), True):
result[tuple(itemset)] = support/len(transactions)
def subs(l):
assert type(l) is list
if len(l) == 1:
return [l]
x = subs(l[1:])
return x + [[l[0]] + y for y in x]
def assRule(freq, min_conf = 0.6):
assert type(freq) is dict
result = []
for item, sup in freq.items():
for subitem in subs(list(item)):
sb = [x for x in item if x not in subitem]
if sb == [] or subitem == []: continue
if len(subitem) == 1 and (subitem[0][0] == 'in' or subitem[0][0] == 'out'):
continue
conf = sup/freq[tuple(subitem)]
if conf >= min_conf:
result.append({'from':subitem, 'to':sb, 'sup':sup, 'conf':conf})
return result
rules = assRule(result, 0.8)
#print(rules)
for ru in rules:
print(ru)
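For reference, subs enumerates every subset, which assRule then filters into antecedent/consequent splits:
# Powerset of a 2-item list, empty set included
print(subs(['a', 'b']))  # [[], ['b'], ['a'], ['a', 'b']]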
In [52]:
query = '''
SELECT in_out.id, in_out.time, in_out.in, in_out.out, T1.st_time, T2.en_time
FROM in_out
LEFT JOIN (
SELECT start_station_id AS st_id, SEC_TO_TIME(AVG(TIME_TO_SEC(DATE_FORMAT(starttime, "%%H:%%i:%%s")))) AS st_time
FROM path
GROUP BY start_station_id
)T1 ON in_out.id = T1.st_id
LEFT JOIN (
SELECT end_station_id AS en_id, SEC_TO_TIME(AVG(TIME_TO_SEC(DATE_FORMAT(stoptime, "%%H:%%i:%%s")))) AS en_time
FROM path
GROUP BY end_station_id
)T2 ON in_out.id = T2.en_id
ORDER BY in_out.id;
'''
table = pd.read_sql_query(query, engine)
transactions = []
for idx, row in table.iterrows():
    rin = row['in'] // 10
    rout = row['out'] // 10
    if rin == 0 or rout == 0:
        continue
    # Floor each station's average start/stop clock time to a 10-minute slot
    st = (datetime.datetime.min + row['st_time']).time().replace(second=0, microsecond=0)
    st = st.replace(minute=st.minute // 10 * 10).isoformat()
    en = (datetime.datetime.min + row['en_time']).time().replace(second=0, microsecond=0)
    en = en.replace(minute=en.minute // 10 * 10).isoformat()
    transactions.append([('stime', st), ('etime', en), ('in', rin), ('out', rout)])

result = {}
for itemset, support in find_frequent_itemsets(transactions, .04 * len(transactions), True):
    result[frozenset(itemset)] = support / len(transactions)

# Same helpers as in the previous cell
def subs(l):
    assert type(l) is list
    if len(l) == 0:
        return [[]]
    x = subs(l[1:])
    return x + [[l[0]] + y for y in x]

def assRule(freq, min_conf=0.6):
    assert type(freq) is dict
    result = []
    for item, sup in freq.items():
        for subitem in subs(list(item)):
            sb = [x for x in item if x not in subitem]
            if sb == [] or subitem == []:
                continue
            if len(subitem) == 1 and (subitem[0][0] == 'in' or subitem[0][0] == 'out'):
                continue
            conf = sup / freq[frozenset(subitem)]
            if conf >= min_conf:
                result.append({'from': subitem, 'to': sb, 'sup': sup, 'conf': conf})
    return result

rules = assRule(result, 0.9)
for ru in rules:
    print(ru)
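The time arithmetic above relies on the averaged TIME column coming back from MySQL as a timedelta; adding it to datetime.min yields a clock time that can then be floored to a 10-minute slot. A standalone check with a made-up average:
# Made-up average of 08:47:31 floors to the 08:40 slot
avg = datetime.timedelta(hours=8, minutes=47, seconds=31)
t = (datetime.datetime.min + avg).time().replace(second=0, microsecond=0)
print(t.replace(minute=t.minute // 10 * 10).isoformat())  # 08:40:00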