In [23]:
import urllib2, urllib
import json
def get_businfo(busno, max_attempts=3):
    """Look up the route id of a bus line from its route number.

    Sends ``strSrch=<busno>`` as the request body to the m.bus.go.kr
    route-list endpoint, decodes the response as cp949 JSON and returns the
    ``busRouteId`` of the first entry in ``resultList``.

    Args:
        busno: Route number to search for, e.g. ``'0017'``.
        max_attempts: Upper bound on the number of requests issued while the
            endpoint keeps answering with a status code of 400 or above.

    Returns:
        The ``busRouteId`` of the first search hit, or ``None`` when the
        response holds no usable hit or every attempt was rejected.
    """
    target = 'http://m.bus.go.kr/mBus/bus/getBusRouteList.bms'
    query = urllib.urlencode({'strSrch': busno})
    for _ in range(max_attempts):
        handler = urllib2.urlopen(target, query)
        try:
            # Only read the body of successful responses.
            raw = handler.read() if handler.code < 400 else None
        finally:
            # Always release the connection, even if read() fails.
            handler.close()
        if raw is None:
            continue
        payload = json.loads(raw.decode('cp949'))
        try:
            return payload["resultList"][0]["busRouteId"]
        except (KeyError, IndexError, TypeError):
            # Missing key, empty list or null resultList: no such route.
            return None
    return None
def get_busroute(busno, max_attempts=3):
    """Print every stop of a bus line as tab-separated, UTF-8 encoded rows.

    Resolves ``busno`` to a route id with ``get_businfo``, requests the
    route from the m.bus.go.kr ``getRouteAndPos`` endpoint and prints one
    line per entry of ``resultList`` with the fields ``busRouteNm``,
    ``busRouteId``, ``stationNo``, ``stationNm``, ``gpsX`` and ``gpsY``.

    Args:
        busno: Route number, e.g. ``'0017'``.
        max_attempts: Upper bound on the number of requests issued while the
            endpoint keeps answering with a status code of 400 or above.

    Returns:
        None. Nothing is printed when the route number is unknown or the
        response carries no stops.
    """
    routeid = get_businfo(busno)
    if not routeid:
        return
    target = 'http://m.bus.go.kr/mBus/bus/getRouteAndPos.bms'
    query = urllib.urlencode({'busRouteId': routeid})
    # Output column order; kept identical to the original hand-written join.
    columns = ('busRouteNm', 'busRouteId', 'stationNo', 'stationNm',
               'gpsX', 'gpsY')
    for _ in range(max_attempts):
        handler = urllib2.urlopen(target, query)
        try:
            raw = handler.read() if handler.code < 400 else None
        finally:
            handler.close()
        if raw is None:
            continue
        payload = json.loads(raw.decode('cp949'))
        # A missing or null resultList is treated as "no stops".
        for route in payload.get("resultList") or []:
            line = '\t'.join([route[name] for name in columns])
            print(line.encode('utf-8'))
        return
def get_busroute_keys(busno, max_attempts=3):
    """Print the comma-separated field names of a route's first stop record.

    Resolves ``busno`` with ``get_businfo``, requests the route from the
    m.bus.go.kr ``getRouteAndPos`` endpoint and prints the keys of the first
    entry of ``resultList``.

    Args:
        busno: Route number, e.g. ``'0017'``.
        max_attempts: Upper bound on the number of requests issued while the
            endpoint keeps answering with a status code of 400 or above.

    Returns:
        None. Nothing is printed when the route number is unknown or the
        response carries no stops.
    """
    routeid = get_businfo(busno)
    if not routeid:
        # Same guard as get_busroute: do not query with a missing route id.
        return
    target = 'http://m.bus.go.kr/mBus/bus/getRouteAndPos.bms'
    query = urllib.urlencode({'busRouteId': routeid})
    for _ in range(max_attempts):
        handler = urllib2.urlopen(target, query)
        try:
            raw = handler.read() if handler.code < 400 else None
        finally:
            handler.close()
        if raw is None:
            continue
        payload = json.loads(raw.decode('cp949'))
        routes = payload.get("resultList") or []
        if routes:
            print(','.join(list(routes[0].keys())))
        return
def main():
    """Print the stop rows of every bus line in the hard-coded roster.

    Each route number is handed to ``get_busroute``, which prints its rows.
    A commented-out header in the original names the output columns as
    'busno, busid, stationid, stationnm, x, y'.
    """
    route_numbers = ['0017', '0018', '1014', '1017', '1020', '1111', '1113', '1114', '1115', '1117', '1119', '1120', '1122', '1124', '1126', '1127', '1128', '1129', '1130', '1131', '1132', '1133', '1135', '1136', '1137', '1138', '1139', '1140', '1141', '1142', '1143', '1144', '1146', '1152', '1154', '1155', '1156', '1157', '1161', '1162', '1164', '1165', '1166', '1212', '1213', '1215', '1218', '1221', '1222', '1224', '1225', '1226', '1227', '1711', '2012', '2013', '2014', '2015', '2016', '2112', '2113', '2114', '2211', '2220', '2221', '2222', '2223', '2224', '2227', '2230', '2233', '2234', '2235', '2411', '2412', '2413', '2415', '3011', '3212', '3214', '3215', '3216', '3217', '3219', '3220', '3313', '3314', '3315', '3316', '3317', '3318', '3319', '3411', '3412', '3413', '3414', '3416', '3417', '3418', '3422', '3423', '4212', '4318', '4319', '4412', '4419', '4425', '4426', '4429', '4430', '4431', '4432', '4433', '4434', '5012', '5413', '5511', '5513', '5515', '5516', '5517', '5519', '5523', '5524', '5525', '5526', '5528', '5530', '5531', '5534', '5535', '5536', '5537', '5538', '5612', '5615', '5616', '5617', '5618', '5619', '5620', '5621', '5623', '5624', '5625', '5626', '5627', '5630', '5633', '5712', '5713', '5714', '6211', '6411', '6511', '6512', '6513', '6514', '6515', '6611', '6613', '6614', '6616', '6617', '6618', '6620', '6623', '6624', '6625', '6627', '6628', '6629', '6630', '6631', '6632', '6635', '6637', '6638', '6640', '6641', '6642', '6643', '6645', '6646', '6647', '6648', '6649', '6650', '6651', '6653', '6654', '6657', '6712', '6714', '6715', '6716', '7011', '7013A', '7013B', '7016', '7017', '7018', '7019', '7021', '7022', '7024', '7025', '7211', '7212', '7611', '7612', '7613', '7711', '7713', '7715', '7719', '7720', '7722', '7723', '7726', '7727', '7728', '7730', '7733', '7737', '7738']
    for route_number in route_numbers:
        get_busroute(route_number)
# if __name__ == '__main__':
# main()
In [1]:
from string import Template
# Stroke colours for route polylines: hex RGB strings without the leading
# '#', picked by position in generate_polyline. One row per intensity level.
colors = [
    "FF0000", "00FF00", "0000FF", "FFFF00", "FF00FF", "00FFFF", "000000",
    "800000", "008000", "000080", "808000", "800080", "008080", "808080",
    "C00000", "00C000", "0000C0", "C0C000", "C000C0", "00C0C0", "C0C0C0",
    "400000", "004000", "000040", "404000", "400040", "004040", "404040",
    "200000", "002000", "000020", "202000", "200020", "002020", "202020",
    "600000", "006000", "000060", "606000", "600060", "006060", "606060",
    "A00000", "00A000", "0000A0", "A0A000", "A000A0", "00A0A0", "A0A0A0",
    "E00000", "00E000", "0000E0", "E0E000", "E000E0", "00E0E0", "E0E0E0",
]
def generate_busstops(df):
    """Build the JavaScript statement declaring the ``busstops`` array.

    Args:
        df: DataFrame with an ``x`` and a ``y`` column; each row becomes one
            ``new google.maps.LatLng(y, x)`` expression.

    Returns:
        A string of the form ``var busstops = [...];`` with the LatLng
        expressions separated by a comma and a newline.
    """
    points = []
    for _, row in df.iterrows():
        # LatLng takes (y, x), so the columns are swapped here.
        points.append('new google.maps.LatLng(%s, %s)' % (row['y'], row['x']))
    return 'var busstops = [{0}];'.format(',\n'.join(points))
def generate_polyline(valname, df, idx=0):
    """Build the JavaScript that draws one route as a Google Maps polyline.

    Args:
        valname: JavaScript variable name for the coordinate array; the
            polyline object itself is named ``<valname>_``.
        df: DataFrame with an ``x`` and a ``y`` column; each row becomes one
            ``new google.maps.LatLng(y, x)`` path point.
        idx: Position in the module-level ``colors`` palette. Values beyond
            the palette wrap around instead of raising ``IndexError``, so
            any number of routes can be drawn.

    Returns:
        A JavaScript snippet that declares the coordinate array, creates the
        polyline with the chosen stroke colour and attaches it to ``map``.
    """
    coos = ',\n'.join(['new google.maps.LatLng(%s, %s)' % (r['y'], r['x'])
                       for _, r in df.iterrows()])
    valcoos = 'var {0} = [{1}];'.format(valname, coos)
    # Doubled braces are literal braces for str.format.
    polyline_template = (
        '\n'
        '{0}\n'
        'var {1}_ = new google.maps.Polyline({{\n'
        'path: {1},\n'
        'strokeColor: "#{2}",\n'
        'strokeOpacity: 0.8,\n'
        'strokeWeight: 3\n'
        '}});\n'
        '{1}_.setMap(map);'
    )
    # Cycle through the palette so idx >= len(colors) reuses colours.
    stroke = colors[idx % len(colors)]
    return polyline_template.format(valcoos, valname, stroke)
# data = {'busstops': busstops, 'busroutes':busroutes}
def generate_template_html(data, outfilename='busmap.html',
                           template_path='map_temp.html'):
    """Fill the map HTML template and write the result to a file.

    Args:
        data: Mapping of placeholder name to replacement text, passed to
            ``string.Template.substitute``. The notebook passes the keys
            ``busstops`` and ``busroutes``.
        outfilename: Path of the HTML file to write.
        template_path: Path of the ``string.Template`` source file.

    Raises:
        KeyError: If the template has a placeholder that ``data`` lacks.
    """
    # Context managers close both files even when substitution fails.
    with open(template_path) as infile:
        template = Template(infile.read())
    map_html = template.substitute(data)
    with open(outfilename, 'w') as outfile:
        outfile.write(map_html)
In [12]:
# numpy was used below without ever being imported, which fails on a
# fresh kernel.
import numpy as np
import pandas as pd

# Load the stop table written out as tab-separated text.
bus_df = pd.read_csv('bus.tsv', sep='\t')

# Drop rows whose stationid is one of three excluded values. '미정차' is
# presumably a "does not stop" marker; why '35331' is excluded is not
# visible here - TODO confirm.
mbus_df = bus_df[bus_df['stationid'] != '0']
mbus_df = mbus_df[mbus_df['stationid'] != '미정차']
mbus_df = mbus_df[mbus_df['stationid'] != '35331']

# Random sample of at most 200 rows, taken by shuffling the index labels.
sampling_mbus = mbus_df.loc[np.random.permutation(mbus_df.index)[:200]]
In [13]:
# Map a subset of the bus stops (the random sample), without route lines.
busstops = generate_busstops(sampling_mbus)
data = dict(busstops=busstops, busroutes='')
generate_template_html(data, outfilename='all_busstop')
In [5]:
# Keep only stations that appear in more than N_BUSSTOPS rows.
N_BUSSTOPS = 8
stops = mbus_df['stationid'].value_counts()
d = stops[stops > N_BUSSTOPS].index.values.tolist()
mbus_df = mbus_df[mbus_df['stationid'].isin(d)]

# One row per station, keeping the last occurrence. `subset`/`keep` replace
# the `cols`/`take_last` keywords, which current pandas no longer accepts.
busstop_df = mbus_df.drop_duplicates(subset='stationid', keep='last')
In [101]:
# Map the selected stops plus, for each of them, the full route of the bus
# line recorded on that row.
busstops = generate_busstops(busstop_df)
polylines = []
for i, (idx, row) in enumerate(busstop_df.iterrows()):
    # Every row of the unfiltered table that belongs to the same bus line.
    route = bus_df[bus_df['busno'] == row['busno']]
    polylines.append(generate_polyline('poly%d' % i, route, i))
busroutes = ''.join(polylines)
data = dict(busstops=busstops, busroutes=busroutes)
generate_template_html(data)
In [7]:
#1 Buses that serve one bus stop
import pandas as pd

bus_df = pd.read_csv('bus.tsv', sep='\t')

# Same exclusions as the other cells, then narrow to station '24138'.
mbus_df = bus_df[bus_df['stationid'] != '0']
mbus_df = mbus_df[mbus_df['stationid'] != '미정차']
mbus_df = mbus_df[mbus_df['stationid'] != '35331']
mbus_df = mbus_df[mbus_df['stationid'] == '24138']

# One row per bus line, keeping the last occurrence. `subset`/`keep` replace
# the `cols`/`take_last` keywords, which current pandas no longer accepts.
station_df = mbus_df.drop_duplicates(subset='busno', keep='last')
In [8]:
#2 Route of each bus
# One marker per distinct station. `subset`/`keep` replace the
# `cols`/`take_last` keywords, which current pandas no longer accepts.
busstops = generate_busstops(
    station_df.drop_duplicates(subset='stationid', keep='last'))
print(busstops)

# One polyline per bus line in station_df, drawn from the unfiltered table.
busroutes = ''
for i, (idx, row) in enumerate(station_df.iterrows()):
    route = bus_df[bus_df['busno'] == row['busno']]
    busroutes += generate_polyline('poly' + str(i), route, i)

data = {'busstops': busstops, 'busroutes': busroutes}
generate_template_html(data, 'bus.html')
In [8]:
#1 Select bus stops
import pandas as pd

bus_df = pd.read_csv('bus.tsv', sep='\t')

# Drop rows whose stationid is one of the three excluded values.
mbus_df = bus_df[bus_df['stationid'] != '0']
mbus_df = mbus_df[mbus_df['stationid'] != '미정차']
mbus_df = mbus_df[mbus_df['stationid'] != '35331']

# Keep only stations that appear in more than N_BUSSTOPS rows.
N_BUSSTOPS = 7
stops = mbus_df['stationid'].value_counts()
d = stops[stops > N_BUSSTOPS].index.values.tolist()
mbus_df = mbus_df[mbus_df['stationid'].isin(d)]

# One row per station, keeping the last occurrence. `subset`/`keep` replace
# the `cols`/`take_last` keywords, which current pandas no longer accepts.
busstop_df = mbus_df.drop_duplicates(subset='stationid', keep='last')
busstop_df
Out[8]:
In [16]:
#2 Visualization
# Only the selected stops are plotted; no route polylines on this map.
busroutes = ''
busstops = generate_busstops(busstop_df)
data = dict(busstops=busstops, busroutes=busroutes)
generate_template_html(data, outfilename='busstop_origin.html')
In [17]:
#3 Clustering
from sklearn.cluster import KMeans
from sklearn import metrics
import matplotlib.pyplot as plt

print(busstop_df.head())

# Features are the stop coordinates; the bus number on each row serves as
# the reference labelling for the V-measure.
X = busstop_df[['x', 'y']]
y = busstop_df['busno']

# Fit KMeans for k = 2..14 and record the V-measure of each labelling.
cluster_range = range(2, 15)
vmeasures = []
for n_cluster in cluster_range:
    km = KMeans(n_clusters=n_cluster, init='k-means++', n_init=10).fit(X)
    vmeasures.append(metrics.v_measure_score(y, km.labels_))

# V-measure as a function of the number of clusters.
plt.plot(cluster_range, vmeasures)
plt.xlabel('# cluster')
plt.ylabel('v measure')
plt.autoscale(tight=True)
plt.grid()
plt.show()
In [20]:
#4 Visualization after clustering
# Refit with the chosen cluster count and plot the cluster centres as stops.
best_clusters = 7
km = KMeans(n_clusters=best_clusters, init='k-means++', n_init=10)
km.fit(X)

centers = pd.DataFrame(km.cluster_centers_, columns=['x', 'y'])
busroutes = ''
busstops = generate_busstops(centers)
data = dict(busstops=busstops, busroutes=busroutes)
generate_template_html(data, outfilename='busstop_cluster_centers.html')
In [15]:
#1. Find outliers with DBSCAN
from collections import namedtuple
from sklearn import metrics
from sklearn.preprocessing import MinMaxScaler, StandardScaler, Normalizer
from sklearn.cluster import DBSCAN

# Work on an explicit copy of the coordinates: adding columns to a slice of
# busstop_df triggers SettingWithCopyWarning, and this also makes the cell
# safe to re-run.
X = X[['x', 'y']].copy()

# Standardise both coordinates in one 2-D call. StandardScaler scales each
# column independently, so the values match per-column scaling, but current
# scikit-learn rejects the 1-D Series the original passed in.
ss = StandardScaler()
scaled = ss.fit_transform(X[['x', 'y']].values)
X['sx'] = scaled[:, 0]
X['sy'] = scaled[:, 1]

Param = namedtuple('Param', ['eps', 'min_samples'])
params = [Param(0.45, 2), Param(0.30, 2), Param(0.35, 2), Param(0.40, 2),
          Param(0.45, 4), Param(0.30, 4), Param(0.35, 4), Param(0.40, 4),
          Param(0.45, 3), Param(0.30, 3), Param(0.35, 3), Param(0.40, 3)]

# Cluster and score in the same (standardised) feature space.
features = X[['sx', 'sy']].values
for param in params:
    dbscan = DBSCAN(eps=param.eps, min_samples=param.min_samples).fit(features)
    labels = dbscan.labels_
    # DBSCAN labels noise points -1; those are the outlier candidates.
    outliers = X[labels == -1]
    print(param)
    print(outliers)
    # silhouette_score needs between 2 and n_samples - 1 distinct labels.
    n_labels = len(set(labels))
    if 1 < n_labels < len(labels):
        print(metrics.silhouette_score(features, labels, metric='euclidean'))
    else:
        print('silhouette score undefined for %d label(s)' % n_labels)
In [22]:
#2 Visualization
# Rerun DBSCAN with the chosen parameters and plot its noise points
# (label -1) as the outlier stops.
best_param = Param(eps=0.3, min_samples=3)
dbscan = DBSCAN(eps=best_param.eps, min_samples=best_param.min_samples)
dbscan.fit(X[['sx', 'sy']].values)
labels = dbscan.labels_
outliers = X[labels == -1]

busroutes = ''
busstops = generate_busstops(outliers)
data = dict(busstops=busstops, busroutes=busroutes)
generate_template_html(data, outfilename='busstop_cluster_outlier.html')
In [18]:
#1 mean shift
# numpy was used below (np.unique) without ever being imported, which fails
# on a fresh kernel.
import numpy as np
import pandas as pd
from sklearn.cluster import MeanShift, estimate_bandwidth

bus_df = pd.read_csv('bus.tsv', sep='\t')

# Drop rows whose stationid is one of the three excluded values.
mbus_df = bus_df[bus_df['stationid'] != '0']
mbus_df = mbus_df[mbus_df['stationid'] != '미정차']
mbus_df = mbus_df[mbus_df['stationid'] != '35331']

# Keep only stations that appear in more than N_BUSSTOPS rows.
N_BUSSTOPS = 7
stops = mbus_df['stationid'].value_counts()
d = stops[stops > N_BUSSTOPS].index.values.tolist()
mbus_df = mbus_df[mbus_df['stationid'].isin(d)]

# One row per station, keeping the last occurrence. `subset`/`keep` replace
# the `cols`/`take_last` keywords, which current pandas no longer accepts.
busstop_df = mbus_df.drop_duplicates(subset='stationid', keep='last')

X = busstop_df[['x', 'y']]
y = busstop_df['busno']
print(X.shape)

# Estimate the kernel bandwidth from the data, then run mean shift.
bandwidth = estimate_bandwidth(X.values, quantile=0.2, n_samples=30)
ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
ms.fit(X.values)
labels = ms.labels_
cluster_centers = ms.cluster_centers_

labels_unique = np.unique(labels)
n_clusters_ = len(labels_unique)
print("number of estimated clusters : %d" % n_clusters_)
print(ms.cluster_centers_)
In [ ]:
#2 Visualization after clustering
# Plot the mean-shift cluster centres as stops, without route polylines.
# NOTE(review): this writes 'busstop_cluster_centers.html', the same file the
# KMeans cell writes, so the earlier map is overwritten - confirm intended.
centers = pd.DataFrame(ms.cluster_centers_, columns=['x', 'y'])
busroutes = ''
busstops = generate_busstops(centers)
data = dict(busstops=busstops, busroutes=busroutes)
generate_template_html(data, outfilename='busstop_cluster_centers.html')