In [1]:
import os
import pandas as pd

In [2]:
df = pd.read_csv('../data/csv/DC_Tweets_5days_CSV.csv')
# df.head()


/Users/jacquelinekazil/Projects/envs/class/lib/python2.7/site-packages/pandas/io/parsers.py:1164: DtypeWarning: Columns (23) have mixed types. Specify dtype option on import or set low_memory=False.
  data = self._reader.read(nrows)
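
The DtypeWarning means pandas saw more than one dtype in column 23 while reading the file in chunks. It is harmless for this analysis, but, as the warning itself suggests, it can be silenced by re-reading with low_memory=False (a sketch, not part of the original run):

In [ ]:
# Sketch: read the whole file before inferring dtypes, which avoids the
# chunk-by-chunk inference that triggers the DtypeWarning
df = pd.read_csv('../data/csv/DC_Tweets_5days_CSV.csv', low_memory=False)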

In [3]:
import matplotlib.pyplot as plt
%matplotlib inline

lookup_keys = ['X', 'Y']

K-means samples


In [5]:
from sklearn.cluster import KMeans

def graph_kmeans(clusters, lookup_keys=['X', 'Y']):
    # Fit K-means with the requested number of clusters on the tweet coordinates
    k_means = KMeans(clusters)
    labels = k_means.fit(df[lookup_keys]).labels_

    # Scatter the points, colored by assigned cluster
    fig, ax = plt.subplots(figsize=(5, 5),
                           subplot_kw={'axisbelow': True})
    ax.scatter(df['X'], df['Y'], c=labels)

In [11]:
graph_kmeans(2)
graph_kmeans(3)
graph_kmeans(4)
graph_kmeans(5)
graph_kmeans(6)
graph_kmeans(10)
graph_kmeans(15)
graph_kmeans(25)
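
Eyeballing the scatter plots only goes so far. One common way to compare the cluster counts tried above is K-means' within-cluster sum of squares, which scikit-learn exposes as the inertia_ attribute of a fitted model. The cell below is a small sketch (added here, not part of the original runs) that plots inertia against k so an "elbow" can be read off the curve; it reuses the df and lookup_keys defined earlier.

In [ ]:
# Sketch: within-cluster sum of squares (inertia) for each k tried above
ks = [2, 3, 4, 5, 6, 10, 15, 25]
inertias = [KMeans(k).fit(df[lookup_keys]).inertia_ for k in ks]

fig, ax = plt.subplots(figsize=(5, 5))
ax.plot(ks, inertias, 'o-')
ax.set_xlabel('number of clusters (k)')
ax.set_ylabel('inertia (within-cluster sum of squares)')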



In [40]:
import numpy as np

In [41]:
from sklearn.cluster import DBSCAN
from sklearn import metrics
from sklearn.preprocessing import StandardScaler

In [42]:
def plot_dbscan(df, n_recs):
    # Pull a random sample of n_recs tweets and standardize the coordinates
    df_subset = df.loc[np.random.choice(df.index, n_recs, replace=False)]
    lookup_keys = ['X', 'Y']
    X = StandardScaler().fit_transform(df_subset[lookup_keys])

    # Fit DBSCAN and track which samples were treated as core points
    db = DBSCAN(eps=0.3, min_samples=10).fit(X)
    core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
    core_samples_mask[db.core_sample_indices_] = True
    labels = db.labels_

    # Number of clusters in labels, ignoring noise if present.
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)

    print('Estimated number of clusters: %d' % n_clusters_)

    ##############################################################################
    # Plot result (plt is already imported at the top of the notebook)

    # Black removed and is used for noise instead.
    unique_labels = set(labels)
    colors = plt.cm.Spectral(np.linspace(0, 1, len(unique_labels)))
    for k, col in zip(unique_labels, colors):
        if k == -1:
            # Black used for noise.
            col = 'k'

        class_member_mask = (labels == k)

        # Core samples of this cluster
        xy = X[class_member_mask & core_samples_mask]
        plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=col,
                 markeredgecolor='k', markersize=6)

        # Non-core (border) samples of this cluster
        xy = X[class_member_mask & ~core_samples_mask]
        plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=col,
                 markeredgecolor='k', markersize=6)

    plt.title('Estimated number of clusters: %d' % n_clusters_)
    plt.show()

In [48]:
#plot_dbscan(df, n_recs)
plot_dbscan(df, 100)
plot_dbscan(df, 500)
plot_dbscan(df, 1000)
plot_dbscan(df, 10000)
plot_dbscan(df, 20000)


Estimated number of clusters: 2
Estimated number of clusters: 2
Estimated number of clusters: 4
Estimated number of clusters: 10
Estimated number of clusters: 13
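
The cluster count climbs with the sample size, likely because eps and min_samples are held fixed while the standardized point cloud gets denser. The cell below is a sketch (added here, with illustrative parameter values) that holds the sample fixed and varies eps to see how sensitive the count is.

In [ ]:
# Sketch: fix the sample, sweep eps, and report clusters and noise points
sample = df.loc[np.random.choice(df.index, 1000, replace=False)]
X_s = StandardScaler().fit_transform(sample[['X', 'Y']])

for eps in [0.1, 0.2, 0.3, 0.5]:
    labels = DBSCAN(eps=eps, min_samples=10).fit(X_s).labels_
    n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
    n_noise = list(labels).count(-1)
    print('eps=%.2f: %d clusters, %d noise points' % (eps, n_clusters, n_noise))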

Affinity Propagation


In [6]:
import numpy as np
from sklearn.cluster import AffinityPropagation
from sklearn import metrics

##############################################################################
# Compute Affinity Propagation

def compute_affinity(df, n_recs, preference=-10):
    # Pull a random sample of n_recs tweets and cluster their coordinates
    df_subset = df.loc[np.random.choice(df.index, n_recs, replace=False)]

    X = df_subset[lookup_keys].values
    af = AffinityPropagation(preference=preference).fit(X)

    cluster_centers_indices = af.cluster_centers_indices_
    labels = af.labels_

    n_clusters_ = len(cluster_centers_indices)

    print('Estimated number of clusters: %d' % n_clusters_)
    print("Silhouette Coefficient: %0.3f"
          % metrics.silhouette_score(X, labels, metric='sqeuclidean'))

    ##############################################################################
    # Plot result
    from itertools import cycle

    plt.figure(1)
    plt.clf()

    colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
    for k, col in zip(range(n_clusters_), colors):
        class_members = labels == k
        cluster_center = X[cluster_centers_indices[k]]

        
        plt.plot(X[class_members, 0], X[class_members, 1], col + '.')
        plt.plot(cluster_center[0], cluster_center[1], 'o', markerfacecolor=col,
                 markeredgecolor='k', markersize=8)
        for x in X[class_members]:
            plt.plot([cluster_center[0], x[0]], [cluster_center[1], x[1]], col)

    plt.title('Estimated number of clusters: %d' % n_clusters_)
    plt.show()
    return

The first four affinity propagation runs below use the same sample size (1,000 records) and the same preference (-10); the only difference between them is the random sample drawn. The last three runs vary the sample size.


In [7]:
compute_affinity(df, 1000)  #preference = -10


Estimated number of clusters: 406
Silhouette Coefficient: 0.117
/Users/jacquelinekazil/Projects/envs/class/lib/python2.7/site-packages/numpy/core/_methods.py:59: RuntimeWarning: Mean of empty slice.
  warnings.warn("Mean of empty slice.", RuntimeWarning)

In [8]:
compute_affinity(df, 1000)  #preference = -10


Estimated number of clusters: 5
Silhouette Coefficient: 0.721

In [9]:
compute_affinity(df, 1000) #preference = -10


Estimated number of clusters: 830
Silhouette Coefficient: 0.053

In [11]:
compute_affinity(df, 1000) #preference = -10


Estimated number of clusters: 481
Silhouette Coefficient: 0.158

In [12]:
compute_affinity(df, 2000)


Estimated number of clusters: 563
Silhouette Coefficient: 0.282

In [13]:
compute_affinity(df, 100)


Estimated number of clusters: 26
Silhouette Coefficient: 0.206

In [14]:
compute_affinity(df, 100)


Estimated number of clusters: 2
Silhouette Coefficient: 0.438
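
The cluster counts and silhouette scores above swing widely from run to run, since each call draws a different random sample and affinity propagation with a fixed preference of -10 is sensitive to that draw. The cell below is a sketch (added here, not part of the original runs) that repeats the fit on several samples of the same size and collects the scores, to get a rough sense of the spread.

In [ ]:
# Sketch: repeat affinity propagation on several 1,000-record samples and
# collect the cluster count and silhouette score for each run
results = []
for i in range(5):
    sample = df.loc[np.random.choice(df.index, 1000, replace=False)]
    X = sample[lookup_keys].values
    af = AffinityPropagation(preference=-10).fit(X)
    n_clusters = len(af.cluster_centers_indices_)
    score = metrics.silhouette_score(X, af.labels_, metric='sqeuclidean')
    results.append((n_clusters, score))

for n_clusters, score in results:
    print('%d clusters, silhouette %.3f' % (n_clusters, score))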

In [ ]: