버스 정보 가져오기



In [23]:

    
import urllib2, urllib
import json

def get_businfo(busno):
    parameters = {}
    parameters['strSrch'] = busno
    target = 'http://m.bus.go.kr/mBus/bus/getBusRouteList.bms'

    parameters = urllib.urlencode(parameters)
    
    while True:
        handler = urllib2.urlopen(target, parameters)
        if handler.code < 400:
            f = handler.read()
            j = json.loads(f.decode('cp949'))
            try:
                return j["resultList"][0]["busRouteId"]
            except:
                return None

def get_busroute(busno):
    routeid = get_businfo(busno)
    if not routeid:
        return 
    parameters = {}
    parameters['busRouteId'] = routeid
    target = 'http://m.bus.go.kr/mBus/bus/getRouteAndPos.bms'
    
    parameters = urllib.urlencode(parameters)
    
    while True:
        handler = urllib2.urlopen(target, parameters)
        if handler.code < 400:
            f = handler.read()
            j = json.loads(f.decode('cp949'))
            routes = j["resultList"]
            for route in routes:
                busRouteNm = route['busRouteNm']
                busRouteId = route['busRouteId']
                stationNm = route['stationNm']
                stationNo = route['stationNo']
                x = route['gpsX']
                y = route['gpsY']
                l = '\t'.join([busRouteNm, busRouteId, stationNo, stationNm, x, y])
                # l = '\t'.join(list(route.values()))
                print(l.encode('utf-8'))
            break

def get_busroute_keys(busno):
    routeid = get_businfo(busno)
    parameters = {}
    parameters['busRouteId'] = routeid
    target = 'http://m.bus.go.kr/mBus/bus/getRouteAndPos.bms'
    
    parameters = urllib.urlencode(parameters)
    
    while True:
        handler = urllib2.urlopen(target, parameters)
        if handler.code < 400:
            f = handler.read()
            j = json.loads(f.decode('cp949'))
            route = j["resultList"][0]
            print(','.join(list(route.keys())))
            break
            
def main():
#     get_busroute_keys('0017')
#     print('busno, busid, stationid, stationnm, x, y')
    busnos = ['0017', '0018', '1014', '1017', '1020', '1111', '1113', '1114', '1115', '1117', '1119', '1120', '1122', '1124', '1126', '1127', '1128', '1129', '1130', '1131', '1132', '1133', '1135', '1136', '1137', '1138', '1139', '1140',  '1141',  '1142', '1143', '1144', '1146', '1152', '1154', '1155', '1156', '1157', '1161', '1162', '1164', '1165', '1166', '1212', '1213', '1215', '1218', '1221', '1222', '1224', '1225', '1226', '1227', '1711', '2012', '2013', '2014', '2015', '2016', '2112', '2113', '2114', '2211', '2220', '2221', '2222', '2223', '2224', '2227', '2230', '2233', '2234', '2235', '2411', '2412', '2413', '2415', '3011', '3212', '3214', '3215', '3216', '3217', '3219', '3220', '3313', '3314', '3315', '3316', '3317', '3318', '3319', '3411', '3412', '3413', '3414', '3416', '3417', '3418', '3422', '3423', '4212', '4318', '4319', '4412', '4419', '4425', '4426', '4429', '4430', '4431', '4432', '4433', '4434', '5012', '5413', '5511', '5513', '5515', '5516', '5517', '5519', '5523', '5524', '5525', '5526', '5528', '5530', '5531', '5534', '5535', '5536', '5537', '5538', '5612', '5615', '5616', '5617', '5618', '5619', '5620', '5621', '5623', '5624', '5625', '5626', '5627', '5630', '5633', '5712', '5713', '5714', '6211', '6411', '6511', '6512', '6513', '6514', '6515', '6611', '6613', '6614', '6616', '6617', '6618', '6620', '6623', '6624', '6625', '6627', '6628', '6629', '6630', '6631', '6632', '6635', '6637', '6638', '6640', '6641', '6642', '6643', '6645', '6646', '6647', '6648', '6649', '6650', '6651', '6653', '6654', '6657', '6712', '6714', '6715', '6716', '7011', '7013A', '7013B', '7016', '7017', '7018', '7019', '7021', '7022', '7024', '7025', '7211', '7212', '7611', '7612', '7613', '7711', '7713', '7715', '7719', '7720', '7722', '7723', '7726', '7727', '7728', '7730', '7733', '7737', '7738']
    for busno in busnos:
        get_busroute(busno)

# if __name__ == '__main__':
#     main()

google map 파일 출력



In [1]:

    
from string import Template

colors = ["FF0000", "00FF00", "0000FF", "FFFF00", "FF00FF", "00FFFF", "000000",
        "800000", "008000", "000080", "808000", "800080", "008080", "808080",
        "C00000", "00C000", "0000C0", "C0C000", "C000C0", "00C0C0", "C0C0C0",
        "400000", "004000", "000040", "404000", "400040", "004040", "404040",
        "200000", "002000", "000020", "202000", "200020", "002020", "202020",
        "600000", "006000", "000060", "606000", "600060", "006060", "606060",
        "A00000", "00A000", "0000A0", "A0A000", "A000A0", "00A0A0", "A0A0A0",
        "E00000", "00E000", "0000E0", "E0E000", "E000E0", "00E0E0", "E0E0E0"]

def generate_busstops(df):
    coos = ',\n'.join(['new google.maps.LatLng(%s, %s)' %(r['y'] ,r['x']) for i, r in df.iterrows()])
    return 'var busstops = [{0}];'.format(coos)

def generate_polyline(valname, df, idx=0):
    coos = ',\n'.join(['new google.maps.LatLng(%s, %s)' %(r['y'] ,r['x']) for i, r in df.iterrows()])
    valcoos =  'var {0} = [{1}];'.format(valname, coos)
    
    polyline_template = """
                        {0}
                        var {1}_ = new google.maps.Polyline({{
                        path: {1},
                        strokeColor: "#{2}",
                        strokeOpacity: 0.8,
                        strokeWeight: 3
                        }});
                        {1}_.setMap(map);"""
    
    return polyline_template.format(valcoos, valname, colors[idx])
    
# data = {'busstops': busstops, 'busroutes':busroutes}
def generate_template_html(data, outfilename='busmap.html'):
    infile = open('map_temp.html')
    template = Template(infile.read())
    map_html = template.substitute(data)
    outfile = open(outfilename, 'w')
    outfile.write(map_html)

버스 정보 읽기와 필터링



In [12]:

    
import pandas as pd

bus_df = pd.read_csv('bus.tsv', sep='\t')

mbus_df = bus_df[bus_df['stationid']!='0']
mbus_df = mbus_df[mbus_df['stationid']!='미정차']
mbus_df = mbus_df[mbus_df['stationid']!='35331']
# mbus_df.shape
sampling_mbus = mbus_df.loc[np.random.permutation(mbus_df.index)[:200]]



In [13]:

    
# 일부 정거장 구하기
busstops = generate_busstops(sampling_mbus)
    
data = {'busstops': busstops, 'busroutes': ''}
generate_template_html(data, 'all_busstop')



In [5]:

    
N_BUSSTOPS = 8

stops = mbus_df['stationid'].value_counts()
d = stops[stops > N_BUSSTOPS].index.values.tolist()
mbus_df = mbus_df[mbus_df['stationid'].isin(d)]
busstop_df = mbus_df.drop_duplicates(cols='stationid', take_last=True)
# busstop_df

필터링 한 버스 정거장 map으로 출력



In [101]:

    
busstops = generate_busstops(busstop_df)
busroutes = ''
i = 0
for idx, row in busstop_df.iterrows():
    route = bus_df[bus_df['busno']==row['busno']]
    busroutes += generate_polyline('poly' + str(i), route, i)
    i += 1
    
data = {'busstops': busstops, 'busroutes': busroutes}
generate_template_html(data)

24138 버스 정거장에 다니는 버스

버스 정거장에 다니는 버스
각 버스의 노선



In [7]:

    
#1 버스 정거장에 다니는 버스

import pandas as pd

bus_df = pd.read_csv('bus.tsv', sep='\t')

mbus_df = bus_df[bus_df['stationid']!='0']
mbus_df = mbus_df[mbus_df['stationid']!='미정차']
mbus_df = mbus_df[mbus_df['stationid']!='35331']

mbus_df = mbus_df[mbus_df['stationid']=='24138']
station_df = mbus_df.drop_duplicates(cols='busno', take_last=True)



In [8]:

    
#2 각 버스의 노선

busstops = generate_busstops(station_df.drop_duplicates(cols='stationid', take_last=True))
print(busstops)
busroutes = ''
i = 0
for idx, row in station_df.iterrows():
    route = bus_df[bus_df['busno']==row['busno']]
#     print(route)
    busroutes += generate_polyline('poly' + str(i), route, i)
    i += 1
    
data = {'busstops': busstops, 'busroutes': busroutes}
generate_template_html(data, 'bus.html')









    



var busstops = [new google.maps.LatLng(37.5128993133, 127.09794576)];

버스가 많은 정차하는 버스 정거장을 클러스러링한다.

버스 정거장 선택
시각화
클러스터링
클러스터링 후 시각화



In [8]:

    
#1 버스 정거장 선택

import pandas as pd

bus_df = pd.read_csv('bus.tsv', sep='\t')

mbus_df = bus_df[bus_df['stationid']!='0']
mbus_df = mbus_df[mbus_df['stationid']!='미정차']
mbus_df = mbus_df[mbus_df['stationid']!='35331']

N_BUSSTOPS = 7

stops = mbus_df['stationid'].value_counts()
d = stops[stops > N_BUSSTOPS].index.values.tolist()
mbus_df = mbus_df[mbus_df['stationid'].isin(d)]
busstop_df = mbus_df.drop_duplicates(cols='stationid', take_last=True)
busstop_df









    Out[8]:






  
    
      
      busno
      busid
      stationid
      stationnm
      x
      y
    
  
  
    
      2555 
        1222
       4122200
       08146
                 석계역굴다리앞
       127.065797
       37.614331
    
    
      3463 
        2114
       4211400
       11312
              한진한화그랑빌아파트
       127.069232
       37.615805
    
    
      3464 
        2114
       4211400
       11284
                 석계역2번출구
       127.066973
       37.615274
    
    
      3466 
        2114
       4211400
       08147
                     석계역
       127.067255
       37.615003
    
    
      3496 
        2114
       4211400
       11283
               석계역1번출구.A
       127.064989
       37.614919
    
    
      4102 
        2235
       4223500
       07418
                 중랑공영차고지
       127.103897
       37.613367
    
    
      6019 
        4319
       4431900
       23243
         강남경찰서.강남운전면허시험장
       127.067156
       37.509783
    
    
      6020 
        4319
       4431900
       24154
                 잠실종합운동장
       127.072816
       37.510419
    
    
      6021 
        4319
       4431900
       24157
                종합운동장사거리
       127.079555
       37.511613
    
    
      6022 
        4319
       4431900
       24158
                 신천역4번출구
       127.084495
       37.511407
    
    
      6023 
        4319
       4431900
       24145
              잠실트리지움아파트앞
       127.091202
       37.511512
    
    
      6024 
        4319
       4431900
       24146
                잠실역.롯데월드
       127.098196
       37.512412
    
    
      6028 
        4319
       4431900
       24138
                잠실역.롯데월드
       127.097946
       37.512899
    
    
      6030 
        4319
       4431900
       24141
       잠실2동주민센터 신천역8번출구앞
       127.087281
       37.511772
    
    
      6031 
        4319
       4431900
       24142
                잠실엘스아파트앞
       127.083390
       37.511871
    
    
      6032 
        4319
       4431900
       24143
                종합운동장사거리
       127.079692
       37.512009
    
    
      6033 
        4319
       4431900
       24144
                 잠실종합운동장
       127.072073
       37.511118
    
    
      6193 
        4419
       4441910
       23197
                  한국전력공사
       127.062719
       37.510408
    
    
      6355 
        4434
       4443400
       23196
              강남경찰서면허시험장
       127.065907
       37.509875
    
    
      8784 
        5713
       4571300
       18777
                     석수역
       126.903310
       37.433717
    
    
      8785 
        5713
       4571300
       18013
                  시흥유통센터
       126.903454
       37.440257
    
    
      8786 
        5713
       4571300
       18011
                  금천폭포공원
       126.903273
       37.447843
    
    
      8787 
        5713
       4571300
       18009
                   시흥사거리
       126.901412
       37.452517
    
    
      8788 
        5713
       4571300
       18007
                    금천구청
       126.898872
       37.459209
    
    
      8789 
        5713
       4571300
       18005
                    말미고개
       126.897582
       37.464160
    
    
      8834 
        5713
       4571300
       18006
                    말미고개
       126.897459
       37.464605
    
    
      8835 
        5713
       4571300
       18008
                    금천구청
       126.899269
       37.457674
    
    
      8836 
        5713
       4571300
       18010
                   시흥사거리
       126.901098
       37.452921
    
    
      8837 
        5713
       4571300
       18012
                  금천폭포공원
       126.902658
       37.448908
    
    
      8838 
        5713
       4571300
       18014
                  시흥유통센터
       126.903344
       37.440743
    
    
      8839 
        5713
       4571300
       18776
                     석수역
       126.902729
       37.434643
    
    
      8932 
        5714
       4571400
       17133
                    남구로역
       126.886196
       37.484273
    
    
      8933 
        5714
       4571400
       17132
                   가리봉시장
       126.886993
       37.482429
    
    
      9253 
        6512
       4651200
       17134
                구로4동자치회관
       126.885645
       37.486098
    
    
      9254 
        6512
       4651200
       17135
                    구로시장
       126.884930
       37.487797
    
    
      9255 
        6512
       4651200
       17137
        구로4동우체국.고대구로병원정문
       126.884240
       37.490248
    
    
      9275 
        6512
       4651200
       19010
           강남성심병원.대림성모병원
       126.907900
       37.490871
    
    
      9276 
        6512
       4651200
       19009
           시흥대로.한국광물자원공사
       126.904695
       37.487030
    
    
      9277 
        6512
       4651200
       17013
                구로디지털단지역
       126.901548
       37.483099
    
    
      9303 
        6512
       4651200
       21112
         금천경찰서.신림푸르지오아파트
       126.911240
       37.481362
    
    
      9306 
        6512
       4651200
       21001
                구로디지털단지역
       126.902447
       37.483935
    
    
      9307 
        6512
       4651200
       20001
                신대방성원상떼빌
       126.904626
       37.486505
    
    
      9308 
        6512
       4651200
       20002
                신대방경남아파트
       126.907770
       37.490299
    
    
      9325 
        6512
       4651200
       17138
        구로4동우체국.고대구로병원정문
       126.884065
       37.489902
    
    
      9326 
        6512
       4651200
       17136
                    구로시장
       126.884517
       37.488173
    
    
      9558 
        6515
       4651500
       21127
                   서울대학교
       126.947952
       37.466741
    
    
      9559 
        6515
       4651500
       21142
        신림중.삼성고.관악문화관도서관
       126.944528
       37.470209
    
    
      9560 
        6515
       4651500
       21143
            서울산업정보학교.삼성교
       126.941180
       37.470861
    
    
      9583 
        6515
       4651500
       21157
                신림동고시촌입구
       126.938145
       37.470469
    
    
      9584 
        6515
       4651500
       21158
            서울산업정보학교.삼성교
       126.942028
       37.470574
    
    
      9585 
        6515
       4651500
       21159
          관악산입구.관악문화관도서관
       126.946438
       37.468163
    
    
      10698
        6635
       4663500
       18003
                   금천우체국
       126.898083
       37.469136
    
    
      10699
        6635
       4663500
       18001
                  문성초등학교
       126.898666
       37.475364
    
    
      10701
        6635
       4663500
       18002
                  문성초등학교
       126.898320
       37.473981
    
    
      10702
        6635
       4663500
       18004
                   금천우체국
       126.897935
       37.470079
    
    
      10742
        6637
       4663700
       19119
                   김안과병원
       126.902834
       37.520416
    
    
      10743
        6637
       4663700
       19162
                당산동진로아파트
       126.896878
       37.521326
    
    
      10823
        6638
       4663800
       15258
                    진명여고
       126.865104
       37.523940
    
    
      10828
        6638
       4663800
       15167
                  목동대학학원
       126.873444
       37.524927
    
    
      10869
       6640A
       4664002
       15194
                    양천구청
       126.866428
       37.516298
    
    
      
      ...
      ...
      ...
      ...
      ...
      ...
    
  

81 rows × 6 columns



In [16]:

    
#2 시각화

busstops = generate_busstops(busstop_df)
busroutes = ''
# i = 0
# for idx, row in busstop_df.iterrows():
#     route = bus_df[bus_df['busno']==row['busno']]
#     busroutes += generate_polyline('poly' + str(i), route, i)
#     i += 1
    
data = {'busstops': busstops, 'busroutes': busroutes}
generate_template_html(data, 'busstop_origin.html')



In [17]:

    
#3 클러스터링

from sklearn.cluster import KMeans
from sklearn import metrics

print(busstop_df.head())
X = busstop_df[['x', 'y']]
y = busstop_df['busno']
cluster_range = range(2, 15)
vmeasures = []

for n_cluster in cluster_range:

    # km = KMeans(n_clusters=7, init='random', max_iter=100, n_init=1, verbose=1)
    km = KMeans(init='k-means++', n_clusters=n_cluster, n_init=10)

#     print "Clustering sparse data with %s" % km
    km.fit(X)
#     print '----------------------------------------------------'
#     print n_cluster
#     print "Homogeneity: %0.3f" % metrics.homogeneity_score(y, km.labels_)
#     print "Completeness: %0.3f" % metrics.completeness_score(y, km.labels_)
#     print "V-measure: %0.3f" % metrics.v_measure_score(y, km.labels_)
#     print "Adjusted Rand-Index: %.3f" % metrics.adjusted_rand_score(y, km.labels_)
    vmeasures.append(metrics.v_measure_score(y, km.labels_))
#     vmeasures.append(metrics.silhouette_score(X, km.labels_, metric='euclidean'))
    
import matplotlib.pyplot as plt
plt.plot(cluster_range, vmeasures)
    
plt.xlabel('# cluster')
plt.ylabel('v measure')
plt.autoscale(tight=True)
plt.grid()
plt.show()









    



     busno    busid stationid   stationnm           x          y
2555  1222  4122200     08146     석계역굴다리앞  127.065797  37.614331
3463  2114  4211400     11312  한진한화그랑빌아파트  127.069232  37.615805
3464  2114  4211400     11284     석계역2번출구  127.066973  37.615274
3466  2114  4211400     08147         석계역  127.067255  37.615003
3496  2114  4211400     11283   석계역1번출구.A  127.064989  37.614919

[5 rows x 6 columns]



In [20]:

    
#4 클러스터링 후 시각화
best_clusters = 7
km = KMeans(init='k-means++', n_clusters=best_clusters, n_init=10)
km.fit(X)
# print km.cluster_centers_
centers = pd.DataFrame(km.cluster_centers_, columns=['x', 'y'])

busstops = generate_busstops(centers)
busroutes = ''
# i = 0
# for idx, row in busstop_df.iterrows():
#     route = bus_df[bus_df['busno']==row['busno']]
#     busroutes += generate_polyline('poly' + str(i), route, i)
#     i += 1
    
data = {'busstops': busstops, 'busroutes': busroutes}
generate_template_html(data, 'busstop_cluster_centers.html')

DBSCAN로 이상치인 홀로 떨어진 빈도수 높은 버스 정거장 찾기

DBSCAN로 이상치 찾기
시각화



In [15]:

    
#1. DBSCAN로 이상치 찾기
from collections import namedtuple

from sklearn.preprocessing import MinMaxScaler, StandardScaler, Normalizer
from sklearn.cluster import DBSCAN

X['sx'] = X.x
X['sy'] = X.y

ss = StandardScaler()
X['sx'] = ss.fit_transform(X.sx)
X['sy'] = ss.fit_transform(X.sy)
# print(X)

Param = namedtuple('Param', ['eps', 'min_samples'])
params = [Param(0.45, 2), Param(0.30, 2), Param(0.35, 2), Param(0.40, 2), 
            Param(0.45, 4), Param(0.30, 4), Param(0.35, 4), Param(0.40, 4), 
            Param(0.45, 3), Param(0.30, 3), Param(0.35, 3), Param(0.40, 3)]
# print(X.values)
for param in params:
    dbscan = DBSCAN(eps=param.eps, min_samples=param.min_samples).fit(X[['sx', 'sy']].values)
    labels = dbscan.labels_
    outliers = X[labels == -1]
    
    print(param)
#     print(labels)
    print(outliers)
#     print("V-measure: %0.3f" % metrics.v_measure_score(y, labels))
    print(metrics.silhouette_score(X, labels, metric='euclidean'))

# busstops = generate_busstops(outliers)
# busroutes = ''
    
# data = {'busstops': busstops, 'busroutes': busroutes}
# generate_template_html(data, 'busstop_cluster_outlier.html')









    



-c:11: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_index,col_indexer] = value instead
-c:12: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_index,col_indexer] = value instead
/media/riemann/dedekind/local/lib/python2.7/site-packages/numpy/core/_methods.py:55: RuntimeWarning: Mean of empty slice.
  warnings.warn("Mean of empty slice.", RuntimeWarning)






    



Param(eps=0.45, min_samples=2)
                x          y        sx        sy
11401  126.801262  37.565235 -1.773647  1.086223

[1 rows x 4 columns]
0.578956767951
Param(eps=0.3, min_samples=2)
                x          y        sx        sy
4102   127.103897  37.613367  2.066672  2.076246
11401  126.801262  37.565235 -1.773647  1.086223
11542  126.837665  37.508486 -1.311701 -0.081022

[3 rows x 4 columns]
0.569195243662
Param(eps=0.35, min_samples=2)
                x          y        sx        sy
4102   127.103897  37.613367  2.066672  2.076246
11401  126.801262  37.565235 -1.773647  1.086223
11542  126.837665  37.508486 -1.311701 -0.081022

[3 rows x 4 columns]
0.557622991947
Param(eps=0.4, min_samples=2)
                x          y        sx        sy
4102   127.103897  37.613367  2.066672  2.076246
11401  126.801262  37.565235 -1.773647  1.086223

[2 rows x 4 columns]
0.559847187262
Param(eps=0.45, min_samples=4)
                x          y        sx        sy
11401  126.801262  37.565235 -1.773647  1.086223

[1 rows x 4 columns]
0.578956767951
Param(eps=0.3, min_samples=4)
                x          y        sx        sy
4102   127.103897  37.613367  2.066672  2.076246
10823  126.865104  37.523940 -0.963517  0.236850
10828  126.873444  37.524927 -0.857686  0.257150
10869  126.866428  37.516298 -0.946719  0.079653
11401  126.801262  37.565235 -1.773647  1.086223
11542  126.837665  37.508486 -1.311701 -0.081022

[6 rows x 4 columns]
0.538119913613
Param(eps=0.35, min_samples=4)
                x          y        sx        sy
4102   127.103897  37.613367  2.066672  2.076246
11401  126.801262  37.565235 -1.773647  1.086223
11542  126.837665  37.508486 -1.311701 -0.081022

[3 rows x 4 columns]
0.557622991947
Param(eps=0.4, min_samples=4)
                x          y        sx        sy
4102   127.103897  37.613367  2.066672  2.076246
11401  126.801262  37.565235 -1.773647  1.086223

[2 rows x 4 columns]
0.559847187262
Param(eps=0.45, min_samples=3)
                x          y        sx        sy
11401  126.801262  37.565235 -1.773647  1.086223

[1 rows x 4 columns]
0.578956767951
Param(eps=0.3, min_samples=3)
                x          y        sx        sy
4102   127.103897  37.613367  2.066672  2.076246
11401  126.801262  37.565235 -1.773647  1.086223
11542  126.837665  37.508486 -1.311701 -0.081022

[3 rows x 4 columns]
0.569195243662
Param(eps=0.35, min_samples=3)
                x          y        sx        sy
4102   127.103897  37.613367  2.066672  2.076246
11401  126.801262  37.565235 -1.773647  1.086223
11542  126.837665  37.508486 -1.311701 -0.081022

[3 rows x 4 columns]
0.557622991947
Param(eps=0.4, min_samples=3)
                x          y        sx        sy
4102   127.103897  37.613367  2.066672  2.076246
11401  126.801262  37.565235 -1.773647  1.086223

[2 rows x 4 columns]
0.559847187262



In [22]:

    
#2 시각화
best_param = Param(eps=0.3, min_samples=3)
dbscan = DBSCAN(eps=best_param.eps, min_samples=best_param.min_samples).fit(X[['sx', 'sy']].values)
labels = dbscan.labels_
outliers = X[labels == -1]

busstops = generate_busstops(outliers)
busroutes = ''
    
data = {'busstops': busstops, 'busroutes': busroutes}
generate_template_html(data, 'busstop_cluster_outlier.html')

mean shift 적용하기

mean shift
시각화



In [18]:

    
#1 mean shift
from sklearn.cluster import MeanShift, estimate_bandwidth

import pandas as pd

bus_df = pd.read_csv('bus.tsv', sep='\t')

mbus_df = bus_df[bus_df['stationid']!='0']
mbus_df = mbus_df[mbus_df['stationid']!='미정차']
mbus_df = mbus_df[mbus_df['stationid']!='35331']

N_BUSSTOPS = 7

stops = mbus_df['stationid'].value_counts()
d = stops[stops > N_BUSSTOPS].index.values.tolist()
mbus_df = mbus_df[mbus_df['stationid'].isin(d)]
busstop_df = mbus_df.drop_duplicates(cols='stationid', take_last=True)
# busstop_df
X = busstop_df[['x', 'y']]
y = busstop_df['busno']

print(X.shape)
bandwidth = estimate_bandwidth(X.values, quantile=0.2, n_samples=30)
ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
ms.fit(X.values)
labels = ms.labels_
cluster_centers = ms.cluster_centers_

labels_unique = np.unique(labels)
n_clusters_ = len(labels_unique)

print("number of estimated clusters : %d" % n_clusters_)
print(ms.cluster_centers_)









    



(81, 2)
number of estimated clusters : 5
[[ 126.90469961   37.47674268]
 [ 126.89581985   37.54925483]
 [ 127.08018682   37.51131514]
 [ 127.07302401   37.61478327]
 [ 126.80126158   37.5652345 ]]



In [ ]:

    
#2 클러스터링 후 시각화

centers = pd.DataFrame(ms.cluster_centers_, columns=['x', 'y'])

busstops = generate_busstops(centers)
busroutes = ''
# i = 0
# for idx, row in busstop_df.iterrows():
#     route = bus_df[bus_df['busno']==row['busno']]
#     busroutes += generate_polyline('poly' + str(i), route, i)
#     i += 1
    
data = {'busstops': busstops, 'busroutes': busroutes}
generate_template_html(data, 'busstop_cluster_centers.html')

	busno	busid	stationid	stationnm	x	y
2555	1222	4122200	08146	석계역굴다리앞	127.065797	37.614331
3463	2114	4211400	11312	한진한화그랑빌아파트	127.069232	37.615805
3464	2114	4211400	11284	석계역2번출구	127.066973	37.615274
3466	2114	4211400	08147	석계역	127.067255	37.615003
3496	2114	4211400	11283	석계역1번출구.A	127.064989	37.614919
4102	2235	4223500	07418	중랑공영차고지	127.103897	37.613367
6019	4319	4431900	23243	강남경찰서.강남운전면허시험장	127.067156	37.509783
6020	4319	4431900	24154	잠실종합운동장	127.072816	37.510419
6021	4319	4431900	24157	종합운동장사거리	127.079555	37.511613
6022	4319	4431900	24158	신천역4번출구	127.084495	37.511407
6023	4319	4431900	24145	잠실트리지움아파트앞	127.091202	37.511512
6024	4319	4431900	24146	잠실역.롯데월드	127.098196	37.512412
6028	4319	4431900	24138	잠실역.롯데월드	127.097946	37.512899
6030	4319	4431900	24141	잠실2동주민센터 신천역8번출구앞	127.087281	37.511772
6031	4319	4431900	24142	잠실엘스아파트앞	127.083390	37.511871
6032	4319	4431900	24143	종합운동장사거리	127.079692	37.512009
6033	4319	4431900	24144	잠실종합운동장	127.072073	37.511118
6193	4419	4441910	23197	한국전력공사	127.062719	37.510408
6355	4434	4443400	23196	강남경찰서면허시험장	127.065907	37.509875
8784	5713	4571300	18777	석수역	126.903310	37.433717
8785	5713	4571300	18013	시흥유통센터	126.903454	37.440257
8786	5713	4571300	18011	금천폭포공원	126.903273	37.447843
8787	5713	4571300	18009	시흥사거리	126.901412	37.452517
8788	5713	4571300	18007	금천구청	126.898872	37.459209
8789	5713	4571300	18005	말미고개	126.897582	37.464160
8834	5713	4571300	18006	말미고개	126.897459	37.464605
8835	5713	4571300	18008	금천구청	126.899269	37.457674
8836	5713	4571300	18010	시흥사거리	126.901098	37.452921
8837	5713	4571300	18012	금천폭포공원	126.902658	37.448908
8838	5713	4571300	18014	시흥유통센터	126.903344	37.440743
8839	5713	4571300	18776	석수역	126.902729	37.434643
8932	5714	4571400	17133	남구로역	126.886196	37.484273
8933	5714	4571400	17132	가리봉시장	126.886993	37.482429
9253	6512	4651200	17134	구로4동자치회관	126.885645	37.486098
9254	6512	4651200	17135	구로시장	126.884930	37.487797
9255	6512	4651200	17137	구로4동우체국.고대구로병원정문	126.884240	37.490248
9275	6512	4651200	19010	강남성심병원.대림성모병원	126.907900	37.490871
9276	6512	4651200	19009	시흥대로.한국광물자원공사	126.904695	37.487030
9277	6512	4651200	17013	구로디지털단지역	126.901548	37.483099
9303	6512	4651200	21112	금천경찰서.신림푸르지오아파트	126.911240	37.481362
9306	6512	4651200	21001	구로디지털단지역	126.902447	37.483935
9307	6512	4651200	20001	신대방성원상떼빌	126.904626	37.486505
9308	6512	4651200	20002	신대방경남아파트	126.907770	37.490299
9325	6512	4651200	17138	구로4동우체국.고대구로병원정문	126.884065	37.489902
9326	6512	4651200	17136	구로시장	126.884517	37.488173
9558	6515	4651500	21127	서울대학교	126.947952	37.466741
9559	6515	4651500	21142	신림중.삼성고.관악문화관도서관	126.944528	37.470209
9560	6515	4651500	21143	서울산업정보학교.삼성교	126.941180	37.470861
9583	6515	4651500	21157	신림동고시촌입구	126.938145	37.470469
9584	6515	4651500	21158	서울산업정보학교.삼성교	126.942028	37.470574
9585	6515	4651500	21159	관악산입구.관악문화관도서관	126.946438	37.468163
10698	6635	4663500	18003	금천우체국	126.898083	37.469136
10699	6635	4663500	18001	문성초등학교	126.898666	37.475364
10701	6635	4663500	18002	문성초등학교	126.898320	37.473981
10702	6635	4663500	18004	금천우체국	126.897935	37.470079
10742	6637	4663700	19119	김안과병원	126.902834	37.520416
10743	6637	4663700	19162	당산동진로아파트	126.896878	37.521326
10823	6638	4663800	15258	진명여고	126.865104	37.523940
10828	6638	4663800	15167	목동대학학원	126.873444	37.524927
10869	6640A	4664002	15194	양천구청	126.866428	37.516298
	...	...	...	...	...	...