In [ ]:
import glob  # for accessing files on our local system
import os
import re  # a package for doing regex

import pandas as pd
In [ ]:
!curl -O http://www.cs.cornell.edu/home/llee/data/convote/convote_v1.1.tar.gz
In [ ]:
!tar -zxvf convote_v1.1.tar.gz
In [ ]:
# Load every speech file from the development set and parse the metadata
# that is encoded at fixed positions in the filename
# (e.g. "052_400011_0327014_DON.txt": bill no, speaker no, party, vote).
paths = glob.glob("convote_v1.1/data_stage_one/development_set/*")
speeches = []
for path in paths:
    # basename is more robust than the original fixed-width slice path[-26:]
    filename = os.path.basename(path)
    speech = {
        'filename': filename,
        'bill_no': filename[:3],
        'speaker_no': filename[4:10],
        'bill_vote': filename[-5],
        'party': filename[-7],
    }
    # `with` guarantees the file handle is closed — the original opened the
    # file and never closed it, leaking one handle per speech.
    with open(path, 'r') as speech_file:
        speech['contents'] = speech_file.read()
    # Drop everything except word characters and spaces, collapse runs of
    # spaces, then count the remaining whitespace-separated tokens.
    cleaned_contents = re.sub(r"[^ \w]", '', speech['contents'])
    cleaned_contents = re.sub(r" +", ' ', cleaned_contents).strip()
    speech['word_count'] = len(cleaned_contents.split(' '))
    speeches.append(speech)
In [ ]:
# Peek at the first five parsed speech dicts to sanity-check the parsing
speeches[:5]
In [ ]:
# Build a DataFrame with one row per speech from the list of dicts
speeches_df = pd.DataFrame(speeches)
speeches_df.head()
In [ ]:
# Summary statistics for speech length; the 50% (median) value is what the
# filtering threshold below is based on
speeches_df["word_count"].describe()
In [ ]:
from sklearn.feature_extraction.text import TfidfVectorizer
In [ ]:
# Vectorise speech text with tf-idf, capped at 10k features and dropping
# sklearn's built-in English stopword list.
vectorizer = TfidfVectorizer(max_features=10000, stop_words='english')
# Keep only speeches longer than the median word count so short procedural
# remarks don't dominate the clustering.  Computing the median (92 for this
# corpus) instead of hard-coding it keeps the filter correct if the data
# ever changes.
median_length = speeches_df["word_count"].median()
longer_speeches = speeches_df[speeches_df["word_count"] > median_length]
X = vectorizer.fit_transform(longer_speeches['contents'])
In [ ]:
from sklearn.cluster import KMeans
In [ ]:
# Cluster the tf-idf vectors into 7 groups.
# NOTE(review): KMeans initialisation is random, so cluster numbering and
# membership can change between runs — consider passing random_state for
# reproducibility.
number_of_clusters = 7
km = KMeans(n_clusters=number_of_clusters)
km.fit(X)
In [ ]:
# Print the 15 highest-weighted tf-idf terms for each cluster centroid.
print("Top terms per cluster:")
# argsort ascending, then reverse each row so the heaviest features come first
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
# get_feature_names() was removed in scikit-learn 1.2; the replacement is
# get_feature_names_out().  The original also used Python-2 print statements
# (`print ''`, trailing-comma prints) which are syntax errors in Python 3.
terms = vectorizer.get_feature_names_out()
for i in range(number_of_clusters):
    top_terms = " ".join(terms[ind] for ind in order_centroids[i, :15])
    print("Cluster %d: %s" % (i, top_terms))
In [ ]:
In [ ]:
# Congressional boilerplate that dominates floor speeches but carries no
# topical signal — these drown out the interesting terms if kept.
additional_stopwords = [
    'mr',
    'congress',
    'chairman',
    'madam',
    'amendment',
    'legislation',
    'speaker',
]
In [ ]:
import nltk
# The stopwords corpus must be downloaded once per machine; fetch it on
# demand instead of crashing with a LookupError on a fresh install.
try:
    english_stopwords = nltk.corpus.stopwords.words('english')
except LookupError:
    nltk.download('stopwords')
    english_stopwords = nltk.corpus.stopwords.words('english')
# Combined custom + standard-English stopword list for the vectorizer below
new_stopwords = additional_stopwords + english_stopwords
In [ ]:
# Rebuild the vectorizer using the combined custom + NLTK stopword list
vectorizer = TfidfVectorizer(max_features=10000, stop_words=new_stopwords)
In [ ]:
# Re-apply the median-length filter (92 words for this corpus) and
# re-vectorise with the new stopword list.
longer_speeches = speeches_df.query("word_count > 92")
X = vectorizer.fit_transform(longer_speeches['contents'])
In [ ]:
# Re-cluster into 7 groups with the custom-stopword tf-idf matrix.
# NOTE(review): KMeans initialisation is random, so cluster numbering can
# change between runs — consider passing random_state for reproducibility.
number_of_clusters = 7
km = KMeans(n_clusters=number_of_clusters)
km.fit(X)
In [ ]:
# Print the 15 highest-weighted tf-idf terms for each cluster centroid.
print("Top terms per cluster:")
# argsort ascending, then reverse each row so the heaviest features come first
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
# get_feature_names() was removed in scikit-learn 1.2; the replacement is
# get_feature_names_out().  The original also used Python-2 print statements
# (`print ''`, trailing-comma prints) which are syntax errors in Python 3.
terms = vectorizer.get_feature_names_out()
for i in range(number_of_clusters):
    top_terms = " ".join(terms[ind] for ind in order_centroids[i, :15])
    print("Cluster %d: %s" % (i, top_terms))
In [ ]:
# `longer_speeches` is a filtered slice of speeches_df; assigning a column
# into it triggers pandas' SettingWithCopyWarning and may silently fail to
# write.  Take an explicit copy first, then attach each row's cluster label.
longer_speeches = longer_speeches.copy()
longer_speeches["k-means label"] = km.labels_
In [ ]:
# Each row now carries its cluster assignment in "k-means label"
longer_speeches.head()
In [ ]:
# NOTE(review): KMeans cluster numbering is not stable across runs, so
# label 1 is not guaranteed to be the "china" cluster on a re-run — verify
# against the top-terms printout before relying on this index.
china_speeches = longer_speeches[longer_speeches["k-means label"] == 1]
In [ ]:
# Inspect the speeches assigned to the selected cluster
china_speeches.head()
In [ ]:
# Sub-cluster the selected ("china") speeches into 5 finer-grained topics
# and print each sub-cluster's 10 strongest terms.
vectorizer = TfidfVectorizer(max_features=10000, stop_words=new_stopwords)
X = vectorizer.fit_transform(china_speeches['contents'])
number_of_clusters = 5
km = KMeans(n_clusters=number_of_clusters)
km.fit(X)
print("Top terms per cluster:")
# argsort ascending, then reverse each row so the heaviest features come first
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
# get_feature_names() was removed in scikit-learn 1.2; the replacement is
# get_feature_names_out().  The original also used Python-2 print statements
# (`print ''`, trailing-comma prints) which are syntax errors in Python 3.
terms = vectorizer.get_feature_names_out()
for i in range(number_of_clusters):
    top_terms = " ".join(terms[ind] for ind in order_centroids[i, :10])
    print("Cluster %d: %s" % (i, top_terms))
In [ ]:
# Show the hyperparameters of the fitted KMeans model
km.get_params()
In [ ]:
# KMeans.score returns the negative inertia (negated sum of squared
# distances to the nearest centroid) — values closer to 0 mean tighter
# clusters
km.score(X)
In [ ]: