In [ ]:
import pandas as pd
import re    # regular expressions, for cleaning up the text
import glob  # for listing files on our local system

We'll be using congressional floor-debate speeches from http://www.cs.cornell.edu/home/llee/data/convote.html to explore k-means clustering.


In [ ]:
!curl -O http://www.cs.cornell.edu/home/llee/data/convote/convote_v1.1.tar.gz

In [ ]:
!tar -zxvf convote_v1.1.tar.gz

In [ ]:
paths = glob.glob("convote_v1.1/data_stage_one/development_set/*")
speeches = []
for path in paths:
    speech = {}
    # The last 26 characters of the path are the bare filename,
    # which encodes the bill number, speaker id, party, and vote
    filename = path[-26:]
    speech['filename'] = filename
    speech['bill_no'] = filename[:3]
    speech['speaker_no'] = filename[4:10]
    speech['bill_vote'] = filename[-5]
    speech['party'] = filename[-7]

    # Open the file and read its contents
    with open(path, 'r') as speech_file:
        speech['contents'] = speech_file.read()

    # Strip punctuation, collapse runs of spaces, and count the words
    cleaned_contents = re.sub(r"[^ \w]", '', speech['contents'])
    cleaned_contents = re.sub(r" +", ' ', cleaned_contents)
    cleaned_contents = cleaned_contents.strip()
    words = cleaned_contents.split(' ')
    speech['word_count'] = len(words)

    speeches.append(speech)

In [ ]:
speeches[:5]

In [ ]:
speeches_df = pd.DataFrame(speeches)
speeches_df.head()

In [ ]:
speeches_df["word_count"].describe()

Notice that a lot of the speeches are relatively short. Their brevity probably makes them poor candidates for clustering, so we'll filter them out.

Time to bring in the TF-IDF vectorizer.

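Before turning it loose on the speeches, here's a minimal sketch of what the vectorizer produces, using a couple of made-up sentences (the toy corpus is purely illustrative):


In [ ]:
from sklearn.feature_extraction.text import TfidfVectorizer

# A made-up two-document corpus, just to show the shape of the output
toy_docs = ["the trade deficit with china", "the farm bill and the trade bill"]
toy_vectorizer = TfidfVectorizer(stop_words='english')
toy_X = toy_vectorizer.fit_transform(toy_docs)
print(toy_vectorizer.get_feature_names_out())  # one column per term
print(toy_X.toarray())                         # one row per document, TF-IDF weights as values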

In [ ]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [ ]:
vectorizer = TfidfVectorizer(max_features=10000, stop_words='english')
longer_speeches = speeches_df[speeches_df["word_count"] > 92].copy()
# keep only speeches longer than 92 words (the median length)
X = vectorizer.fit_transform(longer_speeches['contents'])

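The 92-word cutoff is just the median of the word counts, which we can pull straight out of the dataframe:


In [ ]:
speeches_df["word_count"].median()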
In [ ]:
from sklearn.cluster import KMeans

In [ ]:
number_of_clusters = 7
km = KMeans(n_clusters=number_of_clusters)
km.fit(X)

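km.labels_ now holds a cluster number for each of the longer speeches. A quick sketch to see how many speeches landed in each cluster:


In [ ]:
# How many speeches ended up in each cluster
pd.Series(km.labels_).value_counts()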
In [ ]:
print("Top terms per cluster:")
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()
for i in range(number_of_clusters):
    print("Cluster %d:" % i),
    for ind in order_centroids[i, :15]:
        print(' %s' % terms[ind]),
    print ''

Terms like "mr", "chairman", and "madam" are procedural boilerplate rather than topics, so let's add them as extra stopwords and re-run the clustering.

In [ ]:
additional_stopwords = ['mr','congress','chairman','madam','amendment','legislation','speaker']

In [ ]:
import nltk

# nltk.download('stopwords')  # uncomment if the stopwords corpus isn't downloaded yet
english_stopwords = nltk.corpus.stopwords.words('english')
new_stopwords = additional_stopwords + english_stopwords

In [ ]:
vectorizer = TfidfVectorizer(max_features=10000, stop_words=new_stopwords)

In [ ]:
longer_speeches = speeches_df[speeches_df["word_count"] > 92].copy()
# .copy() lets us add a cluster-label column later without a SettingWithCopyWarning
X = vectorizer.fit_transform(longer_speeches['contents'])

In [ ]:
number_of_clusters = 7
km = KMeans(n_clusters=number_of_clusters)
km.fit(X)

In [ ]:
print("Top terms per cluster:")
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()
for i in range(number_of_clusters):
    print("Cluster %d:" % i),
    for ind in order_centroids[i, :15]:
        print(' %s' % terms[ind]),
    print ''

In [ ]:
longer_speeches["k-means label"] = km.labels_

In [ ]:
longer_speeches.head()

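One of the clusters groups the speeches about trade with China; below we pull it out by its label. Since k-means numbers its clusters arbitrarily, the China cluster won't necessarily be label 1 on every run, so here's a quick sketch for checking which label it actually got:


In [ ]:
# Which cluster do speeches mentioning "china" mostly fall into?
longer_speeches[longer_speeches["contents"].str.contains("china", case=False)]["k-means label"].value_counts()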
In [ ]:
china_speeches = longer_speeches[longer_speeches["k-means label"] == 1]

In [ ]:
china_speeches.head()

In [ ]:
vectorizer = TfidfVectorizer(max_features=10000, stop_words=new_stopwords)
X = vectorizer.fit_transform(china_speeches['contents'])

number_of_clusters = 5
km = KMeans(n_clusters=number_of_clusters)
km.fit(X)

print("Top terms per cluster:")
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()
for i in range(number_of_clusters):
    print("Cluster %d:" % i),
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind]),
    print ''

In [ ]:
km.get_params()

In [ ]:
km.score(X)

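km.score returns the opposite of the inertia (the total squared distance from each speech to its nearest cluster center), so values closer to zero mean tighter clusters. A rough sketch for comparing a few different cluster counts (the range of k values here is arbitrary):


In [ ]:
# Compare scores across a few cluster counts as a rough elbow check
for k in range(2, 10):
    km_k = KMeans(n_clusters=k).fit(X)
    print(k, km_k.score(X))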
In [ ]: