In [1]:
# Install all of the packages you need!
!pip install numpy
!pip install scipy
!pip install scikit-learn
!pip install nltk
In [6]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import re
from nltk.stem.porter import PorterStemmer
pd.options.display.max_columns = 30
%matplotlib inline
The Congressional Record is more or less what happened in Congress every single day. Speeches and all that. A good large source of text data, maybe?
Let's pretend it's totally secret but we just got it leaked to us in a data dump, and we need to check it out. It was leaked from this page here.
In [2]:
# If you'd like to download it through the command line...
!curl -O http://www.cs.cornell.edu/home/llee/data/convote/convote_v1.1.tar.gz
In [ ]:
# And then extract it through the command line...
!tar -zxf convote_v1.1.tar.gz
You can explore the files if you'd like, but we're going to get the ones from convote_v1.1/data_stage_one/development_set/. It's a bunch of text files.
In [7]:
# glob finds files matching a certain filename pattern
import glob
# Give me all the text files
paths = glob.glob('convote_v1.1/data_stage_one/development_set/*')
paths[:5]
Out[7]:
In [8]:
len(paths)
Out[8]:
So great, we have 702 of them. Now let's import them.
In [9]:
speeches = []
for path in paths:
    with open(path) as speech_file:
        speech = {
            'pathname': path,
            'filename': path.split('/')[-1],
            'content': speech_file.read()
        }
    speeches.append(speech)
speeches_df = pd.DataFrame(speeches)
speeches_df.head()
Out[9]:
In class we had the texts variable. For the homework you can just do speeches_df['content'] to get the same sort of list of stuff.
Take a look at the contents of the first 5 speeches
In [11]:
speeches_df['content'].head(5)
Out[11]:
In [17]:
from sklearn.feature_extraction.text import CountVectorizer
count_vectorizer = CountVectorizer(stop_words='english')
X = count_vectorizer.fit_transform(speeches_df['content'])
X.toarray()
pd.DataFrame(X.toarray())
pd.DataFrame(X.toarray(), columns=count_vectorizer.get_feature_names())
Out[17]:
Okay, it's far too big to even look at. Let's try to get a list of features from a new CountVectorizer that only takes the top 100 words.
In [18]:
from sklearn.feature_extraction.text import CountVectorizer
count_vectorizer = CountVectorizer(stop_words='english',max_features=100)
X = count_vectorizer.fit_transform(speeches_df['content'])
X.toarray()
pd.DataFrame(X.toarray())
pd.DataFrame(X.toarray(), columns=count_vectorizer.get_feature_names())
Out[18]:
Now let's push all of that into a dataframe with nicely named columns.
In [19]:
df=pd.DataFrame(X.toarray(), columns=count_vectorizer.get_feature_names())
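As a quick sanity check on the new 100-column dataframe, you can total up each column to see which of the top-100 words show up the most overall. This is just an optional peek at the df we just built:
In [ ]:
# Optional sanity check: total count of each of the 100 words across all speeches
df.sum().sort_values(ascending=False).head(10)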
Everyone seems to start their speeches with "mr chairman" - how many speeches are there in total, how many don't mention "chairman", and how many mention neither "mr" nor "chairman"?
In [27]:
df['chairman'].value_counts().head(5) #Chairman is NOT mentioned in 250 speeches.
Out[27]:
In [29]:
len(df[df['chairman']==0])
Out[29]:
In [35]:
len(df[(df['chairman']==0) & (df['mr']==0)])
Out[35]:
What is the index of the speech that is the most thankful, a.k.a. includes the word 'thank' the most times?
In [36]:
df['thank'].max()
Out[36]:
In [37]:
df[df['thank']==9]  # the index of this row is the most thankful speech
Out[37]:
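A more direct way to get that index, as a sketch on the same df: pandas' idxmax skips the "find the max, then filter" two-step.
In [ ]:
# idxmax returns the index of the row with the largest 'thank' count
df['thank'].idxmax()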
If I'm searching for china and trade, what are the top 3 speeches to read according to the CountVectorizer?
In [59]:
ctdf=df[(df['china']!=0) & (df['trade']!=0)]
nctdf=pd.DataFrame([ctdf['china'], ctdf['trade'], ctdf['china'] + ctdf['trade']], index=["china", "trade", "china+trade"]).T
In [67]:
nctdf.sort_values(by='china+trade',ascending=False).head(3)
Out[67]:
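An alternative, slightly more compact ranking, sketched on the same count dataframe. It assumes "top" just means the combined count of the two words, same as above:
In [ ]:
# Only keep speeches that mention both words, then rank by the combined count
both = df[(df['china'] > 0) & (df['trade'] > 0)]
(both['china'] + both['trade']).nlargest(3)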
Now what if I'm using a TfidfVectorizer?
In [69]:
porter_stemmer = PorterStemmer()
def stemming_tokenizer(str_input):
    # Strip everything except letters, digits and hyphens, lowercase, then stem each word
    words = re.sub(r"[^A-Za-z0-9\-]", " ", str_input).lower().split()
    words = [porter_stemmer.stem(word) for word in words]
    return words
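To see what the tokenizer actually does, you can call it on a sample sentence (the sentence here is just made up for illustration):
In [ ]:
# Returns a list of lowercased, punctuation-stripped Porter stems
stemming_tokenizer("Mr. Chairman, we are debating trade relations with China.")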
In [71]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words='english', tokenizer=stemming_tokenizer, use_idf=False, norm='l1')
X = tfidf_vectorizer.fit_transform(speeches_df['content'])
newdf=pd.DataFrame(X.toarray(), columns=tfidf_vectorizer.get_feature_names())
newdf.head(5)
Out[71]:
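Because this vectorizer uses use_idf=False and norm='l1', each row is just raw term frequencies scaled so the row adds up to 1. You can check that with the dataframe we just built:
In [ ]:
# With norm='l1', every non-empty document's weights should sum to 1
newdf.sum(axis=1).head()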
What's the content of the speeches? Here's a way to get them:
In [72]:
# index 0 is the first speech, which was the first one imported.
paths[0]
Out[72]:
In [91]:
# Pass the path into a shell command using { }, which lets you put Python variables
# into shell commands. On Windows, 'type' does what 'cat' does on Mac/Linux.
!type "convote_v1.1\data_stage_one\development_set\052_400011_0327014_DON.txt"
#!type "{paths[0].replace('/', '\\')}"
In [90]:
!type "{paths[0].replace('/', '\\')}"
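If you're not on Windows (or just want to skip the shell entirely), the same thing works in plain Python, reading the path we already have:
In [ ]:
# Read and print the first speech directly with Python instead of a shell command
with open(paths[0]) as f:
    print(f.read())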
Now search for something else! Another two terms that might show up, like elections and chaos? Whatever you think might be interesting.
In [105]:
ecdf=pd.DataFrame([newdf['elect'], newdf['chao'], newdf['elect'] + newdf['chao']], index=["elections", "chaos", "elections+chaos"]).T
In [107]:
ecdf.sort_values(by='elections+chaos',ascending=False).head(5)
Out[107]:
Using a simple counting vectorizer, cluster the documents into eight categories, telling me what the top terms are per category.
Using a term frequency vectorizer, cluster the documents into eight categories, telling me what the top terms are per category.
Using a term frequency inverse document frequency vectorizer, cluster the documents into eight categories, telling me what the top terms are per category.
In [113]:
# SIMPLE COUNTING VECTORIZER
count_vectorizer = CountVectorizer(stop_words='english',max_features=100)
X = count_vectorizer.fit_transform(speeches_df['content'])
from sklearn.cluster import KMeans
number_of_clusters = 8
km = KMeans(n_clusters=number_of_clusters)
km.fit(X)
Out[113]:
In [114]:
print("Top terms per cluster:")
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
terms = count_vectorizer.get_feature_names()
for i in range(number_of_clusters):
    top_terms = [terms[ind] for ind in order_centroids[i, :5]]
    print("Cluster {}: {}".format(i, ' '.join(top_terms)))
In [109]:
# TF-IDF VECTORIZER (term frequency weighted by inverse document frequency: use_idf=True)
vectorizer = TfidfVectorizer(use_idf=True, tokenizer=stemming_tokenizer, stop_words='english')
X = vectorizer.fit_transform(speeches_df['content'])
from sklearn.cluster import KMeans
number_of_clusters = 8
km = KMeans(n_clusters=number_of_clusters)
km.fit(X)
Out[109]:
In [110]:
print("Top terms per cluster:")
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()
for i in range(number_of_clusters):
    top_terms = [terms[ind] for ind in order_centroids[i, :5]]
    print("Cluster {}: {}".format(i, ' '.join(top_terms)))
In [120]:
# TERM FREQUENCY VECTORIZER (use_idf=False: term frequency only, no idf weighting)
vectorizer = TfidfVectorizer(tokenizer=stemming_tokenizer,use_idf=False,stop_words='english')
X = vectorizer.fit_transform(speeches_df['content'])
from sklearn.cluster import KMeans
number_of_clusters = 8
km = KMeans(n_clusters=number_of_clusters)
km.fit(X)
Out[120]:
In [121]:
print("Top terms per cluster:")
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()
for i in range(number_of_clusters):
    top_terms = [terms[ind] for ind in order_centroids[i, :5]]
    print("Cluster {}: {}".format(i, ' '.join(top_terms)))
Which one do you think works the best?
The TF-IDF vectorizer (the one with use_idf=True) seems to work the best.
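If you want something more quantitative than eyeballing the clusters, one option (not part of the original assignment, just a sketch) is to compare silhouette scores for the three vectorizers. Higher is better, and it reuses speeches_df and stemming_tokenizer from above:
In [ ]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Hypothetical comparison: refit each vectorizer + KMeans and score the clusterings
vectorizers = {
    'count': CountVectorizer(stop_words='english', max_features=100),
    'tf': TfidfVectorizer(stop_words='english', tokenizer=stemming_tokenizer, use_idf=False),
    'tfidf': TfidfVectorizer(stop_words='english', tokenizer=stemming_tokenizer, use_idf=True),
}
for name, vec in vectorizers.items():
    matrix = vec.fit_transform(speeches_df['content'])
    labels = KMeans(n_clusters=8).fit_predict(matrix)
    print(name, silhouette_score(matrix, labels, metric='cosine'))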
I have a scraped collection of Harry Potter fanfiction at https://github.com/ledeprogram/courses/raw/master/algorithms/data/hp.zip.
I want you to read them in, vectorize them and cluster them. Use this process to find out the two types of Harry Potter fanfiction. What is your hypothesis?
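If you don't have the files yet, you can grab and unzip them the same way we grabbed the Congressional Record. This assumes the archive unpacks into an hp/ folder; if the files end up loose in the current directory, run !unzip -q hp.zip -d hp instead.
In [ ]:
# -L follows GitHub's redirect to the actual file
!curl -L -O https://github.com/ledeprogram/courses/raw/master/algorithms/data/hp.zip
!unzip -q hp.zip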
In [130]:
paths = glob.glob('hp/*')
In [131]:
paths[0]
Out[131]:
In [136]:
len(paths)
Out[136]:
In [138]:
speeches = []
for path in paths:
    with open(path) as speech_file:
        speech = {
            'pathname': path,
            'filename': path.split('/')[-1],
            'content': speech_file.read()
        }
    speeches.append(speech)
hpfanfic_df = pd.DataFrame(speeches)
hpfanfic_df.head()
Out[138]:
In [145]:
hpfanfic_df['content'].head(5)
Out[145]:
In [182]:
vectorizer = TfidfVectorizer(use_idf=True, max_features=10000, stop_words='english')
X = vectorizer.fit_transform(hpfanfic_df['content'])
In [189]:
print(vectorizer.get_feature_names()[:10])
In [186]:
df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names())
In [187]:
df.head(5)
Out[187]:
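The execution numbers jump from In [187] to In [196], so the cell that actually fit KMeans on the fanfiction matrix isn't shown. Here's a minimal sketch of what it presumably looked like; the cluster count of 2 is an assumption based on the question asking for the two types of fanfiction.
In [ ]:
from sklearn.cluster import KMeans

# Assumed missing cell: cluster the fanfiction TF-IDF matrix into two groups
number_of_clusters = 2
km = KMeans(n_clusters=number_of_clusters)
km.fit(X)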
In [196]:
print("Top terms per cluster:")
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()
for i in range(number_of_clusters):
    top_terms = [terms[ind] for ind in order_centroids[i, :5]]
    print("Cluster {}: {}".format(i, ' '.join(top_terms)))
In [209]:
hpfanfic_df['category'] = km.labels_
In [210]:
hpfanfic_df.head()
Out[210]:
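To actually form a hypothesis about what the two clusters are, it helps to see how big each cluster is and peek at a few stories from each one. A sketch using the category column we just added (the filenames may or may not be informative titles):
In [ ]:
# How many stories landed in each cluster?
print(hpfanfic_df['category'].value_counts())

# Peek at a few filenames from each cluster
for category, group in hpfanfic_df.groupby('category'):
    print(category, group['filename'].head(3).tolist())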