Make some observation, compare different method and parameters, explain the result and see if the output meet your expect.
In [1]:
from sqlalchemy import create_engine
engine = create_engine('mysql://calee:110010@localhost/citybike')
In [2]:
import pandas as pd
station = pd.read_sql_table(table_name='station', con=engine)
station
Out[2]:
In [3]:
import pandas as pd
import datetime
import numpy as np
query = '''
SELECT * FROM in_out
WHERE time between "2017-07-01" AND "2017-07-07 23:59:59"
ORDER BY time;
'''
flow = pd.read_sql_query(query, con=engine)
flow['time'] = flow['time'] - datetime.datetime(2017, 7, 1, 0, 0, 0)
flow['time'] = flow['time'] / np.timedelta64(1, 's')
flow['time'] = flow[['time']].apply(lambda x: (x - x.min()) / (x.max() - x.min()))
flow
Out[3]:
kmeans 切出中心點 並 作圖
In [4]:
from mpl_toolkits.basemap import Basemap
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import numpy as np
my_map = Basemap(projection='merc', lat_0=40.7, lon_0=-73.98,
resolution = 'h', area_thresh = 0.01,
llcrnrlon=-74.1, llcrnrlat=40.64,
urcrnrlon=-73.9, urcrnrlat=40.85)
def draw_kmeans(num):
kmeans = KMeans(n_clusters=num, random_state=0).fit(station[['latitude','longitude']])
area = kmeans.cluster_centers_
lon = station['longitude'].tolist()
lat = station['latitude'].tolist()
cen_lon = [ a[0] for a in area[:,[1]] ]
cen_lat = [ a[0] for a in area[:,[0]] ]
labels = station['id'].tolist()
fig = plt.figure(frameon=False)
fig.set_size_inches(15,10)
my_map.drawcoastlines()
my_map.drawcountries()
my_map.fillcontinents(color='coral')
my_map.drawmapboundary()
x,y = my_map(lon, lat)
my_map.plot(x, y, 'bo', markersize=1)
x,y = my_map(cen_lon, cen_lat)
my_map.plot(x, y, 'g+', markersize=10)
plt.show()
for num in (2, 3, 5, 10, 15, 20, 30):
draw_kmeans(num)
由上面的資料顯示,用 kmeans 時,將 n 設為 20 似乎是不錯的分群,可以大略把每一個小群集切割出來
DBScan 做出分群 並 作圖
In [6]:
from mpl_toolkits.basemap import Basemap
import matplotlib.pyplot as plt
import numpy as np
from sklearn.cluster import DBSCAN
import numpy as np
from sklearn import metrics
import matplotlib.cm as cm
def draw_DBSCAN(method, min_sup, eps):
dbscan = DBSCAN(eps=eps, min_samples=min_sup, metric=method).fit(station[['latitude','longitude']])
lb = dbscan.labels_
lbmx = max(lb)
llat = []
llon = []
for i in range(-1, lbmx+1):
la = []
lo = []
for idx, j in enumerate(lb):
if i == j:
lo.append(station['longitude'].tolist()[idx])
la.append(station['latitude'].tolist()[idx])
llat.append(la)
llon.append(lo)
# draw
my_map = Basemap(projection='merc', lat_0=40.7, lon_0=-73.98,
resolution = 'h', area_thresh = 0.01,
llcrnrlon=-74.1, llcrnrlat=40.64,
urcrnrlon=-73.9, urcrnrlat=40.85)
lon = station['longitude'].tolist()
lat = station['latitude'].tolist()
labels = station['id'].tolist()
fig = plt.figure(frameon=False)
fig.set_size_inches(15,10)
my_map.drawcoastlines()
my_map.drawcountries()
my_map.fillcontinents(color='coral')
my_map.drawmapboundary()
colors = cm.rainbow(np.linspace(0, 1, lbmx+2))
#print(colors)
#colors = np.array([0,0,0,1]) + colors
#print(colors)
for i in range(lbmx+2):
x,y = my_map(llon[i], llat[i])
my_map.plot(x, y, color=colors[i], markersize=2, marker='o', linestyle='')
plt.show()
for method in ['euclidean']:
for eps in (0.002, 0.004, 0.006):
for min_sup in (5, 10, 15):
# if eps == 0.004 and min_sup == 5: continue
print('method=', method)
print('eps=', eps)
print('min_support=', min_sup)
draw_DBSCAN(method=method, min_sup=min_sup, eps=eps)
由上面的圖可以看出,若 eps 與 min_support 太低,容易出現非常方散,沒有鑑別度的分群,而若 eps 與 min_support 太大,又容易出現把全部點化成同一區,完全沒有分群到。
在這次的資料中,看起來似乎設定 eps = 0.004 min_support = 2,eps = 0.006 min_support = 10,eps = 0.006 min_support = 5,eps = 0.06 min_support = 2,eps = 0.06 min_support = 1 等,都可以切出比較不錯的分群。
In [7]:
from sklearn.cluster import AgglomerativeClustering
array = []
for method in ('ward', 'complete', 'average'):
for aff in ('euclidean', 'l1', 'l2', 'manhattan', 'cosine'):
if method == 'ward' and not aff == 'euclidean':
continue
agg = AgglomerativeClustering(n_clusters=2, affinity=aff, linkage=method, memory="agg_cache")
agg.fit(flow.iloc[1:][:20000])
array.append(agg.labels_)
array
Out[7]:
In [8]:
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
import numpy as np
def randrange(n, vmin, vmax):
'''
Helper function to make an array of random numbers having shape (n, )
with each number distributed Uniform(vmin, vmax).
'''
return (vmax - vmin)*np.random.rand(n) + vmin
def draw(data):
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
n = 100
# For each set of style and range settings, plot n random points in the box
# defined by x in [23, 32], y in [0, 100], z in [zlow, zhigh].
xs1 = np.array([])
xs2 = np.array([])
ys1 = np.array([])
ys2 = np.array([])
zs1 = np.array([])
zs2 = np.array([])
s1 = []
s2 = []
for idx, d in enumerate(data):
#print(flow['time'][idx])
if d == 0:
xs1 = np.append(xs1, flow['time'][idx])
ys1 = np.append(ys1, flow['in'][idx])
zs1 = np.append(zs1, flow['out'][idx])
s1.append(1)
else:
xs2 = np.append(xs2, flow['time'][idx])
ys2 = np.append(ys2, flow['in'][idx])
zs2 = np.append(zs2, flow['out'][idx])
s2.append(1)
#print(xs1)
# for c, m, zlow, zhigh in [('r', 'o', -50, -25), ('b', '^', -30, -5)]:
# xs = randrange(n, 23, 32)
# ys = randrange(n, 0, 100)
# zs = randrange(n, zlow, zhigh)
ax.scatter(xs1, ys1, zs1, c='r', marker='o', s=s1)
ax.scatter(xs2, ys2, zs2, c='b', marker='^', s=s2)
ax.set_xlabel('time stemp')
ax.set_ylabel('in-flow')
ax.set_zlabel('out-flow')
plt.show()
for tmp in array:
#print(tmp)
draw(tmp)
不管用任何算法,感覺看不出明顯的分佈差異
In [23]:
from sklearn.decomposition import PCA
pca = PCA(n_components=3).fit(flow.iloc[1:])
print(pca.components_)
In [ ]: