In [23]:
    
import urllib2, urllib
import json
def _post_json(target, parameters, max_retries=3):
    """POST ``parameters`` to ``target`` and return the parsed JSON body.

    The body is decoded as cp949 before parsing. A response whose status
    code is 400 or above is retried; at most ``max_retries`` requests are
    sent in total, so a persistently failing server cannot spin forever.

    Raises:
        IOError: if none of the attempts returned a status code below 400.
    """
    last_code = None
    for _ in range(max_retries):
        handler = urllib2.urlopen(target, parameters)
        try:
            if handler.code < 400:
                return json.loads(handler.read().decode('cp949'))
            last_code = handler.code
        finally:
            # Release the connection whether or not the body was usable.
            handler.close()
    raise IOError('%s failed %d times (last HTTP status %s)'
                  % (target, max_retries, last_code))


def get_businfo(busno):
    """Return the ``busRouteId`` of the first search hit for ``busno``.

    Returns None when the response carries no ``resultList`` entry with a
    ``busRouteId``.
    """
    parameters = urllib.urlencode({'strSrch': busno})
    j = _post_json('http://m.bus.go.kr/mBus/bus/getBusRouteList.bms', parameters)
    try:
        return j["resultList"][0]["busRouteId"]
    except (KeyError, IndexError, TypeError):
        # Missing key, empty list, or a null resultList: no such route.
        return None


def get_busroute(busno):
    """Print one tab-separated, UTF-8 encoded line per station of bus ``busno``.

    Columns: busRouteNm, busRouteId, stationNo, stationNm, gpsX, gpsY.
    Prints nothing and returns None when the bus number cannot be resolved.
    """
    routeid = get_businfo(busno)
    if not routeid:
        return
    parameters = urllib.urlencode({'busRouteId': routeid})
    j = _post_json('http://m.bus.go.kr/mBus/bus/getRouteAndPos.bms', parameters)
    for route in j["resultList"]:
        fields = [route['busRouteNm'], route['busRouteId'], route['stationNo'],
                  route['stationNm'], route['gpsX'], route['gpsY']]
        print('\t'.join(fields).encode('utf-8'))


def get_busroute_keys(busno):
    """Print the comma-separated keys of the first station record of ``busno``.

    Prints nothing and returns None when the bus number cannot be resolved,
    matching get_busroute.
    """
    routeid = get_businfo(busno)
    if not routeid:
        return
    parameters = urllib.urlencode({'busRouteId': routeid})
    j = _post_json('http://m.bus.go.kr/mBus/bus/getRouteAndPos.bms', parameters)
    route = j["resultList"][0]
    print(','.join(list(route.keys())))
            
def main():
    """Print the station list of every bus number in the hard-coded table."""
#     get_busroute_keys('0017')
#     print('busno, busid, stationid, stationnm, x, y')
    route_numbers = [
        '0017', '0018',
        '1014', '1017', '1020', '1111', '1113', '1114', '1115', '1117', '1119', '1120',
        '1122', '1124', '1126', '1127', '1128', '1129', '1130', '1131', '1132', '1133',
        '1135', '1136', '1137', '1138', '1139', '1140', '1141', '1142', '1143', '1144',
        '1146', '1152', '1154', '1155', '1156', '1157', '1161', '1162', '1164', '1165',
        '1166', '1212', '1213', '1215', '1218', '1221', '1222', '1224', '1225', '1226',
        '1227', '1711',
        '2012', '2013', '2014', '2015', '2016', '2112', '2113', '2114', '2211', '2220',
        '2221', '2222', '2223', '2224', '2227', '2230', '2233', '2234', '2235', '2411',
        '2412', '2413', '2415',
        '3011', '3212', '3214', '3215', '3216', '3217', '3219', '3220', '3313', '3314',
        '3315', '3316', '3317', '3318', '3319', '3411', '3412', '3413', '3414', '3416',
        '3417', '3418', '3422', '3423',
        '4212', '4318', '4319', '4412', '4419', '4425', '4426', '4429', '4430', '4431',
        '4432', '4433', '4434',
        '5012', '5413', '5511', '5513', '5515', '5516', '5517', '5519', '5523', '5524',
        '5525', '5526', '5528', '5530', '5531', '5534', '5535', '5536', '5537', '5538',
        '5612', '5615', '5616', '5617', '5618', '5619', '5620', '5621', '5623', '5624',
        '5625', '5626', '5627', '5630', '5633', '5712', '5713', '5714',
        '6211', '6411', '6511', '6512', '6513', '6514', '6515', '6611', '6613', '6614',
        '6616', '6617', '6618', '6620', '6623', '6624', '6625', '6627', '6628', '6629',
        '6630', '6631', '6632', '6635', '6637', '6638', '6640', '6641', '6642', '6643',
        '6645', '6646', '6647', '6648', '6649', '6650', '6651', '6653', '6654', '6657',
        '6712', '6714', '6715', '6716',
        '7011', '7013A', '7013B', '7016', '7017', '7018', '7019', '7021', '7022', '7024',
        '7025', '7211', '7212', '7611', '7612', '7613', '7711', '7713', '7715', '7719',
        '7720', '7722', '7723', '7726', '7727', '7728', '7730', '7733', '7737', '7738',
    ]
    for route_number in route_numbers:
        get_busroute(route_number)
# if __name__ == '__main__':
#     main()
    
In [1]:
    
from string import Template
# Stroke-colour palette for route polylines: 56 six-digit hex strings
# (8 rows of 7) without the leading '#'. generate_polyline picks one entry
# by index and prepends the '#' itself.
colors = ["FF0000", "00FF00", "0000FF", "FFFF00", "FF00FF", "00FFFF", "000000",
        "800000", "008000", "000080", "808000", "800080", "008080", "808080",
        "C00000", "00C000", "0000C0", "C0C000", "C000C0", "00C0C0", "C0C0C0",
        "400000", "004000", "000040", "404000", "400040", "004040", "404040",
        "200000", "002000", "000020", "202000", "200020", "002020", "202020",
        "600000", "006000", "000060", "606000", "600060", "006060", "606060",
        "A00000", "00A000", "0000A0", "A0A000", "A000A0", "00A0A0", "A0A0A0",
        "E00000", "00E000", "0000E0", "E0E000", "E000E0", "00E0E0", "E0E0E0"]
def generate_busstops(df):
    """Build the JavaScript statement that declares the ``busstops`` array.

    Every row of ``df`` becomes ``new google.maps.LatLng(<y>, <x>)``, taken
    from the row's ``y`` and ``x`` columns; entries are separated by a comma
    and a newline.
    """
    latlng_exprs = []
    for _, stop in df.iterrows():
        latlng_exprs.append('new google.maps.LatLng(%s, %s)' % (stop['y'], stop['x']))
    joined = ',\n'.join(latlng_exprs)
    return 'var busstops = [{0}];'.format(joined)
def generate_polyline(valname, df, idx=0):
    """Build the JavaScript that draws the rows of ``df`` as one map polyline.

    Args:
        valname: JavaScript variable name for the coordinate array; the
            Polyline object itself is named ``<valname>_``.
        df: frame with ``x`` and ``y`` columns; each row becomes
            ``new google.maps.LatLng(<y>, <x>)``.
        idx: index into the module-level ``colors`` palette. It wraps around
            the palette length, so drawing more polylines than there are
            colours reuses colours instead of raising IndexError.

    Returns:
        The JavaScript snippet as a string.
    """
    coos = ',\n'.join(['new google.maps.LatLng(%s, %s)' %(r['y'] ,r['x']) for i, r in df.iterrows()])
    valcoos =  'var {0} = [{1}];'.format(valname, coos)
    
    polyline_template = """
                        {0}
                        var {1}_ = new google.maps.Polyline({{
                        path: {1},
                        strokeColor: "#{2}",
                        strokeOpacity: 0.8,
                        strokeWeight: 3
                        }});
                        {1}_.setMap(map);"""
    
    # Modulo keeps every in-range (and negative) index unchanged and only
    # alters the case that used to raise.
    stroke_color = colors[idx % len(colors)]
    return polyline_template.format(valcoos, valname, stroke_color)
    
# data = {'busstops': busstops, 'busroutes':busroutes}
def generate_template_html(data, outfilename='busmap.html'):
    """Fill ``map_temp.html`` with ``data`` and write the result to ``outfilename``.

    Args:
        data: mapping of ``$placeholder`` names used in ``map_temp.html`` to
            their replacement text (the callers pass ``busstops`` and
            ``busroutes``).
        outfilename: path of the HTML file to write.

    Raises:
        KeyError: if the template has a placeholder that ``data`` lacks
            (behaviour of ``string.Template.substitute``).
    """
    # Context managers close both files even if substitution fails, so the
    # output is flushed as soon as the function returns.
    with open('map_temp.html') as infile:
        template = Template(infile.read())
    map_html = template.substitute(data)
    with open(outfilename, 'w') as outfile:
        outfile.write(map_html)
    
In [12]:
    
# numpy was used below without ever being imported, which raises NameError
# on a fresh kernel.
import numpy as np
import pandas as pd

SAMPLE_SIZE = 200
SAMPLE_SEED = 0  # fixed so the same stops are sampled on every run

bus_df = pd.read_csv('bus.tsv', sep='\t')
# Drop rows whose stationid is one of three excluded values.
# NOTE(review): '미정차' presumably marks stops the bus does not serve; why
# '35331' is excluded is not recorded here - confirm.
mbus_df = bus_df[bus_df['stationid']!='0']
mbus_df = mbus_df[mbus_df['stationid']!='미정차']
mbus_df = mbus_df[mbus_df['stationid']!='35331']
# mbus_df.shape
# Shuffle the index labels and keep the first SAMPLE_SIZE of them.
shuffled_index = np.random.RandomState(SAMPLE_SEED).permutation(mbus_df.index)
sampling_mbus = mbus_df.loc[shuffled_index[:SAMPLE_SIZE]]
    
In [13]:
    
# Plot a subset of the bus stops (markers only, no route polylines).
# NOTE(review): the output name has no '.html' extension, unlike the other
# cells - confirm this is intended.
busstops = generate_busstops(sampling_mbus)

data = dict(busstops=busstops, busroutes='')
generate_template_html(data, outfilename='all_busstop')
    
In [5]:
    
# Keep only stops whose stationid appears in more than N_BUSSTOPS rows,
# then reduce to one row (the last one) per stop.
N_BUSSTOPS = 8
stops = mbus_df['stationid'].value_counts()
is_frequent = stops > N_BUSSTOPS
d = stops[is_frequent].index.values.tolist()
in_frequent_stop = mbus_df['stationid'].isin(d)
mbus_df = mbus_df[in_frequent_stop]
# NOTE(review): `cols=` / `take_last=` are the keyword spellings of older
# pandas releases (newer ones use `subset=` / `keep='last'`) - confirm the
# pandas version this notebook is pinned to.
busstop_df = mbus_df.drop_duplicates(cols='stationid', take_last=True)
# busstop_df
# busstop_df
    
In [101]:
    
# Markers for the selected stops, plus one polyline per row of busstop_df
# showing every station of that row's bus number.
busstops = generate_busstops(busstop_df)
polyline_snippets = []
for i, (idx, row) in enumerate(busstop_df.iterrows()):
    same_bus = bus_df['busno'] == row['busno']
    route = bus_df[same_bus]
    polyline_snippets.append(generate_polyline('poly' + str(i), route, i))
busroutes = ''.join(polyline_snippets)

data = dict(busstops=busstops, busroutes=busroutes)
generate_template_html(data)
    
In [7]:
    
#1 Buses that serve one particular bus stop
import pandas as pd
bus_df = pd.read_csv('bus.tsv', sep='\t')
# Apply the same three stationid exclusions one after another.
mbus_df = bus_df
for excluded_id in ('0', '미정차', '35331'):
    mbus_df = mbus_df[mbus_df['stationid'] != excluded_id]
# Keep only rows for stop '24138', then one row (the last) per bus number.
at_target_stop = mbus_df['stationid'] == '24138'
mbus_df = mbus_df[at_target_stop]
station_df = mbus_df.drop_duplicates(cols='busno', take_last=True)
    
In [8]:
    
#2 Route of each bus
unique_stops = station_df.drop_duplicates(cols='stationid', take_last=True)
busstops = generate_busstops(unique_stops)
print(busstops)
# One polyline per bus in station_df, covering every station of that bus.
polyline_snippets = []
for i, (idx, row) in enumerate(station_df.iterrows()):
    same_bus = bus_df['busno'] == row['busno']
    route = bus_df[same_bus]
    polyline_snippets.append(generate_polyline('poly' + str(i), route, i))
busroutes = ''.join(polyline_snippets)

data = dict(busstops=busstops, busroutes=busroutes)
generate_template_html(data, outfilename='bus.html')
    
    
In [8]:
    
#1 Select bus stops
import pandas as pd
bus_df = pd.read_csv('bus.tsv', sep='\t')
# Apply the three stationid exclusions one after another.
mbus_df = bus_df
for excluded_id in ('0', '미정차', '35331'):
    mbus_df = mbus_df[mbus_df['stationid'] != excluded_id]
# Keep stops whose stationid appears in more than N_BUSSTOPS rows, then one
# row (the last) per stop.
N_BUSSTOPS = 7
stops = mbus_df['stationid'].value_counts()
is_frequent = stops > N_BUSSTOPS
d = stops[is_frequent].index.values.tolist()
mbus_df = mbus_df[mbus_df['stationid'].isin(d)]
busstop_df = mbus_df.drop_duplicates(cols='stationid', take_last=True)
busstop_df
    
    Out[8]:
In [16]:
    
#2 Visualization
# Markers only: the per-bus polyline loop is intentionally left disabled.
busstops = generate_busstops(busstop_df)
busroutes = ''
# for i, (idx, row) in enumerate(busstop_df.iterrows()):
#     route = bus_df[bus_df['busno']==row['busno']]
#     busroutes += generate_polyline('poly' + str(i), route, i)

data = dict(busstops=busstops, busroutes=busroutes)
generate_template_html(data, outfilename='busstop_origin.html')
    
In [17]:
    
#3 Clustering: fit KMeans for k = 2..14 on the stop coordinates and plot
#  the V-measure of each labelling against the stops' bus numbers.
from sklearn.cluster import KMeans
from sklearn import metrics
import matplotlib.pyplot as plt

# KMeans initialisation is random; without a fixed seed the curve (and the
# k chosen from it) changes on every run.
KMEANS_SEED = 0

print(busstop_df.head())
X = busstop_df[['x', 'y']]
y = busstop_df['busno']
cluster_range = range(2, 15)
vmeasures = []
for n_cluster in cluster_range:
    km = KMeans(init='k-means++', n_clusters=n_cluster, n_init=10,
                random_state=KMEANS_SEED)
    km.fit(X)
    vmeasures.append(metrics.v_measure_score(y, km.labels_))
    # Alternative criterion that needs no reference labels:
    # vmeasures.append(metrics.silhouette_score(X, km.labels_, metric='euclidean'))

plt.plot(cluster_range, vmeasures)

plt.xlabel('# cluster')
plt.ylabel('v measure')
plt.autoscale(tight=True)
plt.grid()
plt.show()
    
    
    
In [20]:
    
#4 Visualization after clustering: plot the KMeans cluster centres
best_clusters = 7
# Fixed seed so the plotted centres are the same on every run.
km = KMeans(init='k-means++', n_clusters=best_clusters, n_init=10, random_state=0)
# Fit on the coordinate columns explicitly: the DBSCAN cell adds 'sx'/'sy'
# columns to X, and four-column centres would not fit columns=['x', 'y'].
km.fit(X[['x', 'y']])
# print km.cluster_centers_
centers = pd.DataFrame(km.cluster_centers_, columns=['x', 'y'])
busstops = generate_busstops(centers)
# Markers only: the per-bus polyline loop is intentionally left disabled.
busroutes = ''
# i = 0
# for idx, row in busstop_df.iterrows():
#     route = bus_df[bus_df['busno']==row['busno']]
#     busroutes += generate_polyline('poly' + str(i), route, i)
#     i += 1

data = {'busstops': busstops, 'busroutes': busroutes}
generate_template_html(data, 'busstop_cluster_centers.html')
    
In [15]:
    
#1. Find outliers with DBSCAN
from collections import namedtuple
from sklearn import metrics
from sklearn.preprocessing import MinMaxScaler, StandardScaler, Normalizer
from sklearn.cluster import DBSCAN

# Work on a copy so the new columns are not written onto a slice of
# busstop_df.
X = X.copy()
# Standardise both coordinates in one 2-D call. Each column is scaled
# independently, as with one fit per column, but without handing the scaler
# 1-D input.
ss = StandardScaler()
scaled_xy = ss.fit_transform(X[['x', 'y']].values)
X['sx'] = scaled_xy[:, 0]
X['sy'] = scaled_xy[:, 1]
# print(X)
Param = namedtuple('Param', ['eps', 'min_samples'])
params = [Param(0.45, 2), Param(0.30, 2), Param(0.35, 2), Param(0.40, 2), 
            Param(0.45, 4), Param(0.30, 4), Param(0.35, 4), Param(0.40, 4), 
            Param(0.45, 3), Param(0.30, 3), Param(0.35, 3), Param(0.40, 3)]
# print(X.values)
scaled_features = X[['sx', 'sy']].values
for param in params:
    dbscan = DBSCAN(eps=param.eps, min_samples=param.min_samples).fit(scaled_features)
    labels = dbscan.labels_
    # DBSCAN labels noise points -1.
    outliers = X[labels == -1]
    
    print(param)
#     print(labels)
    print(outliers)
#     print("V-measure: %0.3f" % metrics.v_measure_score(y, labels))
    # Score in the same scaled space DBSCAN clustered in, not over the raw
    # and scaled columns together. The score is only defined for
    # 2 <= n_labels <= n_samples - 1, so skip it otherwise.
    n_labels = len(set(labels))
    if 1 < n_labels < len(labels):
        print(metrics.silhouette_score(scaled_features, labels, metric='euclidean'))
    else:
        print('silhouette undefined: %d distinct label(s)' % n_labels)
# busstops = generate_busstops(outliers)
# busroutes = ''
    
# data = {'busstops': busstops, 'busroutes': busroutes}
# generate_template_html(data, 'busstop_cluster_outlier.html')
    
    
    
In [22]:
    
#2 Visualization
# Re-run DBSCAN with the chosen parameters on the scaled coordinates and plot
# the points it labels -1 (noise).
best_param = Param(eps=0.3, min_samples=3)
scaled_features = X[['sx', 'sy']].values
dbscan = DBSCAN(**best_param._asdict()).fit(scaled_features)
labels = dbscan.labels_
is_noise = labels == -1
outliers = X[is_noise]
busstops = generate_busstops(outliers)
busroutes = ''

data = dict(busstops=busstops, busroutes=busroutes)
generate_template_html(data, outfilename='busstop_cluster_outlier.html')
    
In [18]:
    
#1 mean shift
# numpy is needed for np.unique below and was never imported.
import numpy as np
from sklearn.cluster import MeanShift, estimate_bandwidth
import pandas as pd
bus_df = pd.read_csv('bus.tsv', sep='\t')
# Drop rows whose stationid is one of three excluded values.
mbus_df = bus_df[bus_df['stationid']!='0']
mbus_df = mbus_df[mbus_df['stationid']!='미정차']
mbus_df = mbus_df[mbus_df['stationid']!='35331']
# Keep stops whose stationid appears in more than N_BUSSTOPS rows, then one
# row (the last) per stop.
N_BUSSTOPS = 7
stops = mbus_df['stationid'].value_counts()
d = stops[stops > N_BUSSTOPS].index.values.tolist()
mbus_df = mbus_df[mbus_df['stationid'].isin(d)]
busstop_df = mbus_df.drop_duplicates(cols='stationid', take_last=True)
# busstop_df
X = busstop_df[['x', 'y']]
y = busstop_df['busno']
print(X.shape)
# Estimate the kernel bandwidth from the coordinates, then cluster with it.
bandwidth = estimate_bandwidth(X.values, quantile=0.2, n_samples=30)
ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
ms.fit(X.values)
labels = ms.labels_
cluster_centers = ms.cluster_centers_
labels_unique = np.unique(labels)
n_clusters_ = len(labels_unique)
print("number of estimated clusters : %d" % n_clusters_)
print(ms.cluster_centers_)
    
    
In [ ]:
    
#2 Visualization after clustering: plot the mean-shift cluster centres
# NOTE(review): this writes 'busstop_cluster_centers.html', the same file the
# KMeans visualization cell writes - confirm the overwrite is intended.
centers = pd.DataFrame(ms.cluster_centers_, columns=['x', 'y'])
busstops = generate_busstops(centers)
# Markers only: the per-bus polyline loop is intentionally left disabled.
busroutes = ''
# for i, (idx, row) in enumerate(busstop_df.iterrows()):
#     route = bus_df[bus_df['busno']==row['busno']]
#     busroutes += generate_polyline('poly' + str(i), route, i)

data = dict(busstops=busstops, busroutes=busroutes)
generate_template_html(data, outfilename='busstop_cluster_centers.html')