In [1]:
import os
import pandas as pd
In [2]:
df = pd.read_csv('../data/csv/DC_Tweets_5days_CSV.csv')
#df.head()
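A quick sanity check on the coordinate columns before clustering; this sketch assumes X and Y are the point coordinates used throughout the notebook.
In [ ]:
# Confirm the coordinate columns exist and look sane before clustering.
print(df.shape)
print(df[['X', 'Y']].describe())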
In [3]:
import matplotlib.pyplot as plt
%matplotlib inline
lookup_keys=['X','Y']
In [5]:
from sklearn.cluster import KMeans

def graph_kmeans(clusters, lookup_keys=['X', 'Y']):
    # Fit k-means with the requested number of clusters and color the
    # scatter plot by cluster label.
    k_means = KMeans(n_clusters=clusters)
    labels = k_means.fit(df[lookup_keys]).labels_
    fig, ax = plt.subplots(figsize=(5, 5),
                           subplot_kw={'axisbelow': True})
    ax.scatter(df['X'], df['Y'], c=labels)
In [11]:
graph_kmeans(2)
graph_kmeans(3)
graph_kmeans(4)
graph_kmeans(5)
graph_kmeans(6)
graph_kmeans(10)
graph_kmeans(15)
graph_kmeans(25)
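K-means needs the cluster count chosen up front, and the sweep above is eyeballed. One common heuristic for narrowing k is an elbow plot of the model's inertia (within-cluster sum of squares); a minimal sketch, assuming the same df and lookup_keys as above:
In [ ]:
# Elbow plot: inertia for a range of k; look for the bend in the curve.
ks = range(2, 16)
inertias = [KMeans(n_clusters=k).fit(df[lookup_keys]).inertia_ for k in ks]
fig, ax = plt.subplots(figsize=(5, 5))
ax.plot(list(ks), inertias, 'o-')
ax.set_xlabel('k')
ax.set_ylabel('inertia')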
In [40]:
import numpy as np
In [41]:
from sklearn.cluster import DBSCAN
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
In [42]:
def plot_dbscan(df, n_recs):
    # Run DBSCAN on a random sample of n_recs rows and plot the result.
    df_subset = df.loc[np.random.choice(df.index, n_recs, replace=False)]
    lookup_keys = ['X', 'Y']
    X = StandardScaler().fit_transform(df_subset[lookup_keys])
    db = DBSCAN(eps=0.3, min_samples=10).fit(X)
    core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
    core_samples_mask[db.core_sample_indices_] = True
    labels = db.labels_
    # Number of clusters in labels, ignoring noise if present.
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
    print('Estimated number of clusters: %d' % n_clusters_)
    ##########################################################################
    # Plot result
    # Black removed and is used for noise instead.
    unique_labels = set(labels)
    colors = plt.cm.Spectral(np.linspace(0, 1, len(unique_labels)))
    for k, col in zip(unique_labels, colors):
        if k == -1:
            # Black used for noise.
            col = 'k'
        class_member_mask = (labels == k)
        # Core samples plotted larger than border samples.
        xy = X[class_member_mask & core_samples_mask]
        plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=col,
                 markeredgecolor='k', markersize=14)
        xy = X[class_member_mask & ~core_samples_mask]
        plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=col,
                 markeredgecolor='k', markersize=6)
    plt.title('Estimated number of clusters: %d' % n_clusters_)
    plt.show()
In [48]:
#plot_dbscan(df, n_recs)
plot_dbscan(df, 100)
plot_dbscan(df, 500)
plot_dbscan(df, 1000)
plot_dbscan(df, 10000)
plot_dbscan(df, 20000)
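The eps=0.3 hard-coded in plot_dbscan is a guess. A standard heuristic for picking eps is the k-distance plot: sort every point's distance to its min_samples-th nearest neighbor and look for the knee. A sketch under the same standardization as plot_dbscan (the sample size of 1000 is arbitrary):
In [ ]:
from sklearn.neighbors import NearestNeighbors

# k-distance plot: sorted distance to each point's 10th nearest neighbor
# (10 matches min_samples above); the knee suggests a reasonable eps.
sample = df.loc[np.random.choice(df.index, 1000, replace=False)]
X = StandardScaler().fit_transform(sample[lookup_keys])
dists, _ = NearestNeighbors(n_neighbors=10).fit(X).kneighbors(X)
plt.plot(np.sort(dists[:, -1]))
plt.xlabel('points sorted by 10th-NN distance')
plt.ylabel('distance')
plt.show()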
In [6]:
import numpy as np
from sklearn.cluster import AffinityPropagation
from sklearn import metrics
##############################################################################
# Compute Affinity Propagation
def compute_affinity(df, n_recs, preference=-10):
    # Fit affinity propagation on a random sample of n_recs rows,
    # report the cluster count and silhouette score, then plot.
    df_subset = df.loc[np.random.choice(df.index, n_recs, replace=False)]
    X = df_subset[lookup_keys].to_numpy()
    af = AffinityPropagation(preference=preference).fit(X)
    cluster_centers_indices = af.cluster_centers_indices_
    labels = af.labels_
    n_clusters_ = len(cluster_centers_indices)
    print('Estimated number of clusters: %d' % n_clusters_)
    print("Silhouette Coefficient: %0.3f"
          % metrics.silhouette_score(X, labels, metric='sqeuclidean'))
    ##########################################################################
    # Plot result
    from itertools import cycle
    plt.figure(1)
    plt.clf()
    colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
    for k, col in zip(range(n_clusters_), colors):
        class_members = labels == k
        cluster_center = X[cluster_centers_indices[k]]
        plt.plot(X[class_members, 0], X[class_members, 1], col + '.')
        plt.plot(cluster_center[0], cluster_center[1], 'o', markerfacecolor=col,
                 markeredgecolor='k', markersize=8)
        for x in X[class_members]:
            plt.plot([cluster_center[0], x[0]], [cluster_center[1], x[1]], col)
    plt.title('Estimated number of clusters: %d' % n_clusters_)
    plt.show()
The affinity propagation runs below all use the same sample size; the only difference between them is the random sample drawn.
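If you want any single run to be repeatable, seed NumPy's generator before calling it; a sketch (the seed value is arbitrary):
In [ ]:
# Seeding makes the np.random.choice sample inside compute_affinity repeatable.
np.random.seed(42)
compute_affinity(df, 1000)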
In [7]:
compute_affinity(df, 1000) #preference = -10
In [8]:
compute_affinity(df, 1000) #preference = -10
In [9]:
compute_affinity(df, 1000) #preference = -10
In [11]:
compute_affinity(df, 1000) #preference = -10
In [12]:
compute_affinity(df, 2000)
In [13]:
compute_affinity(df, 100)
In [14]:
compute_affinity(df, 100)
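The preference parameter, fixed at -10 above, is what really controls how many exemplars affinity propagation produces: more negative values yield fewer clusters. A sketch of a small sweep on one sample (the preference values are illustrative, not tuned for this data):
In [ ]:
# Sweep the preference on a single sample to see its effect on cluster count.
sample = df.loc[np.random.choice(df.index, 500, replace=False)]
X = sample[lookup_keys].to_numpy()
for pref in (-5, -10, -50, -200):
    af = AffinityPropagation(preference=pref).fit(X)
    print('preference=%d -> %d clusters' % (pref, len(af.cluster_centers_indices_)))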