In [ ]:
import pandas as pd
import re    # regular expressions, for cleaning up the text
import glob  # for listing files on our local system

We'll be using congressional floor-debate speeches from http://www.cs.cornell.edu/home/llee/data/convote.html to explore k-means clustering.


In [ ]:
!curl -O http://www.cs.cornell.edu/home/llee/data/convote/convote_v1.1.tar.gz

In [ ]:
!tar -zxvf convote_v1.1.tar.gz

In [ ]:
paths = glob.glob("convote_v1.1/data_stage_one/development_set/*")
speeches = []
for path in paths:
    speech = {}
    # The last 26 characters of the path are the bare filename,
    # which encodes the bill number, speaker id, party, and vote
    filename = path[-26:]
    speech['filename'] = filename
    speech['bill_no'] = filename[:3]
    speech['speaker_no'] = filename[4:10]
    speech['bill_vote'] = filename[-5]
    speech['party'] = filename[-7]

    # Open the file and read its contents
    with open(path, 'r') as speech_file:
        speech['contents'] = speech_file.read()

    # Strip punctuation, collapse runs of spaces, and count the words
    cleaned_contents = re.sub(r"[^ \w]", '', speech['contents'])
    cleaned_contents = re.sub(r" +", ' ', cleaned_contents)
    cleaned_contents = cleaned_contents.strip()
    words = cleaned_contents.split(' ')
    speech['word_count'] = len(words)

    speeches.append(speech)

In [ ]:
speeches[:5]

In [ ]:
speeches_df = pd.DataFrame(speeches)
speeches_df.head()

In [ ]:
speeches_df["word_count"].describe()

Notice that a lot of the speeches are relatively short. Their brevity probably makes them poor candidates for clustering, so we'll filter them out.

Time to bring in the TF-IDF vectorizer.

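Before turning it loose on the speeches, here's a minimal sketch of what the vectorizer produces, using a couple of made-up sentences (the toy corpus is purely illustrative):


In [ ]:
from sklearn.feature_extraction.text import TfidfVectorizer

# A made-up two-document corpus, just to show the shape of the output
toy_docs = ["the trade deficit with china", "the farm bill and the trade bill"]
toy_vectorizer = TfidfVectorizer(stop_words='english')
toy_X = toy_vectorizer.fit_transform(toy_docs)
print(toy_vectorizer.get_feature_names_out())  # one column per term
print(toy_X.toarray())                         # one row per document, TF-IDF weights as values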

In [ ]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [ ]:
vectorizer = TfidfVectorizer(max_features=10000, stop_words='english')
longer_speeches = speeches_df[speeches_df["word_count"] > 92].copy()
# keep only speeches longer than 92 words (the median length)
X = vectorizer.fit_transform(longer_speeches['contents'])

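The 92-word cutoff is just the median of the word counts, which we can pull straight out of the dataframe:


In [ ]:
speeches_df["word_count"].median()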
In [ ]:
from sklearn.cluster import KMeans

In [ ]:
number_of_clusters = 7
km = KMeans(n_clusters=number_of_clusters)
km.fit(X)

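km.labels_ now holds a cluster number for each of the longer speeches. A quick sketch to see how many speeches landed in each cluster:


In [ ]:
# How many speeches ended up in each cluster
pd.Series(km.labels_).value_counts()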
In [ ]:
print("Top terms per cluster:")
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()
for i in range(number_of_clusters):
    print("Cluster %d:" % i),
    for ind in order_centroids[i, :15]:
        print(' %s' % terms[ind]),
    print ''

Terms like "mr", "chairman", and "madam" are procedural boilerplate rather than topics, so let's add them as extra stopwords and re-run the clustering.

In [ ]:
additional_stopwords = ['mr','congress','chairman','madam','amendment','legislation','speaker']

In [ ]:
import nltk

# nltk.download('stopwords')  # uncomment if the stopwords corpus isn't downloaded yet
english_stopwords = nltk.corpus.stopwords.words('english')
new_stopwords = additional_stopwords + english_stopwords

In [ ]:
vectorizer = TfidfVectorizer(max_features=10000, stop_words=new_stopwords)

In [ ]:
longer_speeches = speeches_df[speeches_df["word_count"] > 92].copy()
# .copy() lets us add a cluster-label column later without a SettingWithCopyWarning
X = vectorizer.fit_transform(longer_speeches['contents'])

In [ ]:
number_of_clusters = 7
km = KMeans(n_clusters=number_of_clusters)
km.fit(X)

In [ ]:
print("Top terms per cluster:")
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()
for i in range(number_of_clusters):
    print("Cluster %d:" % i),
    for ind in order_centroids[i, :15]:
        print(' %s' % terms[ind]),
    print ''

In [ ]:
longer_speeches["k-means label"] = km.labels_

In [ ]:
longer_speeches.head()

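One of the clusters groups the speeches about trade with China; below we pull it out by its label. Since k-means numbers its clusters arbitrarily, the China cluster won't necessarily be label 1 on every run, so here's a quick sketch for checking which label it actually got:


In [ ]:
# Which cluster do speeches mentioning "china" mostly fall into?
longer_speeches[longer_speeches["contents"].str.contains("china", case=False)]["k-means label"].value_counts()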
In [ ]:
china_speeches = longer_speeches[longer_speeches["k-means label"] == 1]

In [ ]:
china_speeches.head()

In [ ]:
vectorizer = TfidfVectorizer(max_features=10000, stop_words=new_stopwords)
X = vectorizer.fit_transform(china_speeches['contents'])

number_of_clusters = 5
km = KMeans(n_clusters=number_of_clusters)
km.fit(X)

print("Top terms per cluster:")
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()
for i in range(number_of_clusters):
    print("Cluster %d:" % i),
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind]),
    print ''

In [ ]:
km.get_params()

In [ ]:
km.score(X)

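km.score returns the opposite of the inertia (the total squared distance from each speech to its nearest cluster center), so values closer to zero mean tighter clusters. A rough sketch for comparing a few different cluster counts (the range of k values here is arbitrary):


In [ ]:
# Compare scores across a few cluster counts as a rough elbow check
for k in range(2, 10):
    km_k = KMeans(n_clusters=k).fit(X)
    print(k, km_k.score(X))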
In [ ]: