In [1]:
    
import os
import pandas as pd
    
In [2]:
    
# Load the 5-day DC tweet dump; assumed to contain point-coordinate columns
# 'X' and 'Y' used by every clustering routine below — TODO confirm schema.
# Path is relative to the notebook's working directory.
df = pd.read_csv('../data/csv/DC_Tweets_5days_CSV.csv')
# df.head()  # uncomment to preview the first rows (note: needs the call parens)
    
    
In [3]:
    
import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib.pyplot as plt
lookup_keys=['X','Y']
    
In [5]:
    
from sklearn.cluster import KMeans
# (removed a no-op `df = df` self-assignment left over from editing)
def graph_kmeans(clusters, lookup_keys=None, data=None):
    """Fit k-means on two coordinate columns and scatter-plot the points
    colored by their cluster label.

    Parameters
    ----------
    clusters : int
        Number of clusters (k).
    lookup_keys : list of str, optional
        Two column names ``[x_col, y_col]``; defaults to ``['X', 'Y']``.
        (Was a mutable default list argument.)
    data : pandas.DataFrame, optional
        Frame to cluster; defaults to the notebook-global ``df``.

    Returns
    -------
    matplotlib.axes.Axes
        The axes the scatter was drawn on (previously returned None).
    """
    if lookup_keys is None:  # avoid the shared mutable-default-argument pitfall
        lookup_keys = ['X', 'Y']
    if data is None:
        data = df  # fall back to the notebook-global DataFrame, as before

    labels = KMeans(n_clusters=clusters).fit(data[lookup_keys]).labels_

    fig, ax = plt.subplots(figsize=(5, 5), subplot_kw={'axisbelow': True})
    # Plot the same columns that were clustered. The original hardcoded
    # 'X'/'Y' here, silently ignoring a caller-supplied lookup_keys.
    ax.scatter(data[lookup_keys[0]], data[lookup_keys[1]], c=labels)
    ax.set_title('k-means, k=%d' % clusters)
    return ax
    
In [11]:
    
# Sweep the cluster count k; each call produces its own figure.
# (Replaces eight copy-pasted calls with a loop — same calls, same order.)
for k in (2, 3, 4, 5, 6, 10, 15, 25):
    graph_kmeans(k)
    
    
    
    
    
    
    
    
    
In [40]:
    
import numpy as np
    
In [41]:
    
from sklearn.cluster import DBSCAN
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
    
In [42]:
    
def plot_dbscan(df, n_recs, eps=0.3, min_samples=10):
    """Run DBSCAN on a random subsample of `df` and plot the found clusters.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain numeric 'X' and 'Y' columns — TODO confirm against loader.
    n_recs : int
        Number of rows to sample without replacement.
    eps, min_samples : optional
        DBSCAN density parameters (previously hardcoded to 0.3 and 10).
    """
    df_subset = df.loc[np.random.choice(df.index, n_recs, replace=False)]
    lookup_keys = ['X', 'Y']
    # Standardize so `eps` has the same meaning on both axes.
    X = StandardScaler().fit_transform(df_subset[lookup_keys])
    db = DBSCAN(eps=eps, min_samples=min_samples).fit(X)
    core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
    core_samples_mask[db.core_sample_indices_] = True
    labels = db.labels_

    # Number of clusters in labels, ignoring noise (label -1) if present.
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
    print('Estimated number of clusters: %d' % n_clusters_)

    # Plot result: one color per cluster, black for noise. Core samples are
    # drawn larger (14) than border samples (6) so the distinction computed
    # above is actually visible — the original drew both at size 6.
    # (Removed a redundant in-function matplotlib import; plt is imported at
    # the top of the notebook.)
    unique_labels = set(labels)
    colors = plt.cm.Spectral(np.linspace(0, 1, len(unique_labels)))
    for k, col in zip(unique_labels, colors):
        if k == -1:
            col = 'k'  # black used for noise
        class_member_mask = (labels == k)

        xy = X[class_member_mask & core_samples_mask]
        plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=col,
                 markeredgecolor='k', markersize=14)

        xy = X[class_member_mask & ~core_samples_mask]
        plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=col,
                 markeredgecolor='k', markersize=6)
    plt.title('Estimated number of clusters: %d' % n_clusters_)
    plt.show()
    
In [48]:
    
# Sweep the subsample size; cluster estimates stabilize as n grows.
# (Replaces five copy-pasted calls with a loop and drops a commented-out
# template call — same calls, same order.)
for n in (100, 500, 1000, 10000, 20000):
    plot_dbscan(df, n)
    
    
    
    
    
    
    
    
    
    
    
In [6]:
    
import numpy as np
from sklearn.cluster import AffinityPropagation
from sklearn import metrics
# NOTE(review): removed `from sklearn.datasets.samples_generator import make_blobs`
# — that module path was removed in scikit-learn 0.24 (it is now
# sklearn.datasets.make_blobs), and make_blobs is never used in this notebook,
# so the import only served to crash the cell on modern sklearn.
##############################################################################
# Compute Affinity Propagation
def compute_affinity(df, n_recs, preference=-10, lookup_keys=None):
    """Run Affinity Propagation on a random subsample of `df` and plot it.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the coordinate columns.
    n_recs : int
        Number of rows to sample without replacement.
    preference : float, optional
        AffinityPropagation preference; more negative values yield fewer
        clusters (default -10, as in the original cells).
    lookup_keys : list of str, optional
        Coordinate columns; defaults to ['X', 'Y']. (Previously read from a
        notebook-global variable defined in an earlier cell.)
    """
    if lookup_keys is None:
        lookup_keys = ['X', 'Y']
    df_subset = df.loc[np.random.choice(df.index, n_recs, replace=False)]
    # (Removed a leftover debugging line that rebound `df` to a random 4x5
    #  frame; it was dead code — everything below uses `df_subset`/`X`.)
    # .as_matrix() was deprecated and removed in pandas 1.0; .to_numpy() is
    # the supported replacement.
    X = df_subset[lookup_keys].to_numpy()
    af = AffinityPropagation(preference=preference).fit(X)

    cluster_centers_indices = af.cluster_centers_indices_
    labels = af.labels_
    n_clusters_ = len(cluster_centers_indices)
    print('Estimated number of clusters: %d' % n_clusters_)
    print("Silhouette Coefficient: %0.3f"
          % metrics.silhouette_score(X, labels, metric='sqeuclidean'))

    # Plot result: one color per cluster, with a line from each exemplar
    # (cluster center) to every member of its cluster.
    from itertools import cycle
    plt.figure(1)
    plt.clf()
    colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
    for k, col in zip(range(n_clusters_), colors):
        class_members = labels == k
        cluster_center = X[cluster_centers_indices[k]]

        plt.plot(X[class_members, 0], X[class_members, 1], col + '.')
        plt.plot(cluster_center[0], cluster_center[1], 'o', markerfacecolor=col,
                 markeredgecolor='k', markersize=8)
        for x in X[class_members]:
            plt.plot([cluster_center[0], x[0]], [cluster_center[1], x[1]], col)
    plt.title('Estimated number of clusters: %d' % n_clusters_)
    plt.show()
    return
    
The affinity-propagation runs below all use the same sample size; the only difference between them is the particular random sample drawn.
In [7]:
    
compute_affinity(df, 1000)  #preference = -10
    
    
    
    
In [8]:
    
compute_affinity(df, 1000)  #preference = -10
    
    
    
In [9]:
    
compute_affinity(df, 1000) #preference = -10
    
    
    
In [11]:
    
compute_affinity(df, 1000) #preference = -10
    
    
    
In [12]:
    
compute_affinity(df, 2000)
    
    
    
In [13]:
    
compute_affinity(df, 100)
    
    
    
In [14]:
    
compute_affinity(df, 100)
    
    
    
In [ ]: