In [2]:
# Import all of the things you need to import!
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import re
from nltk.stem.porter import PorterStemmer
from sklearn.cluster import KMeans
pd.options.display.max_columns = 30
%matplotlib inline
The Congressional Record is more or less what happened in Congress every single day. Speeches and all that. A good large source of text data, maybe?
Let's pretend it's totally secret but we just got it leaked to us in a data dump, and we need to check it out. It was leaked from this page here.
In [3]:
# If you'd like to download it through the command line...
!curl -O http://www.cs.cornell.edu/home/llee/data/convote/convote_v1.1.tar.gz
In [4]:
# And then extract it through the command line...
!tar -zxf convote_v1.1.tar.gz
You can explore the files if you'd like, but we're going to get the ones from convote_v1.1/data_stage_one/development_set/. It's a bunch of text files.
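If you do want to poke around first, here's a quick sketch using plain Python (the directory path assumes the tarball unpacked the way it did above):
import os
# Peek at a few of the filenames inside the extracted directory
for name in sorted(os.listdir('convote_v1.1/data_stage_one/development_set'))[:5]:
    print(name)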
In [5]:
# glob finds files matching a certain filename pattern
import glob
# Give me all the text files
paths = glob.glob('convote_v1.1/data_stage_one/development_set/*')
paths[:5]
Out[5]:
In [6]:
len(paths)
Out[6]:
So great, we have 702 of them. Now let's import them.
In [7]:
speeches = []
for path in paths:
    with open(path) as speech_file:
        speech = {
            'pathname': path,
            'filename': path.split('/')[-1],
            'content': speech_file.read()
        }
    speeches.append(speech)
speeches_df = pd.DataFrame(speeches)
speeches_df.head()
Out[7]:
In class we had the texts variable. For the homework you can just do speeches_df['content'] to get the same sort of list of stuff.
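In other words (just a sketch), if any of the in-class code expects a plain Python list called texts, you can build one like this:
# speeches_df['content'] is a pandas Series of raw speech text;
# .tolist() turns it into a plain list like the in-class `texts` variable
texts = speeches_df['content'].tolist()
len(texts)  # should match len(paths)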
Take a look at the contents of the first 5 speeches.
In [8]:
for item in speeches_df['content'][:5]:
    print(item[:140], "\n")
In [9]:
count_vectorizer = CountVectorizer(stop_words='english')
X = count_vectorizer.fit_transform(speeches_df['content'])
X
Out[9]:
In [10]:
X_df = pd.DataFrame(X.toarray(), columns=count_vectorizer.get_feature_names())
In [11]:
X_df.head(10)
Out[11]:
Okay, it's far too big to even look at. Let's try to get a list of features from a new CountVectorizer that only takes the top 100 words.
In [12]:
count_vectorizer = CountVectorizer(stop_words='english', max_features=100)
X = count_vectorizer.fit_transform(speeches_df['content'])
X
Out[12]:
In [13]:
pd.DataFrame(X.toarray(), columns=count_vectorizer.get_feature_names()).head()
Out[13]:
Now let's push all of that into a dataframe with nicely named columns.
In [14]:
X_df = pd.DataFrame(X.toarray(), columns=count_vectorizer.get_feature_names())
Everyone seems to start their speeches with "mr chairman" - how many speeches are there in total, how many don't mention "chairman", and how many mention neither "mr" nor "chairman"?
In [15]:
no_chairman = X_df[X_df['chairman'] == 0]['chairman'].count()
no_chairman_no_mr = X_df[(X_df['chairman'] == 0) & (X_df['mr'] == 0)]['chairman'].count()
print("In a total of", len(X_df), "speeches,", no_chairman, "don't mention “chairman” and", no_chairman_no_mr, "mention neither “mr” nor “chairman”.")
What is the index of the speech that is the most thankful, a.k.a. includes the word 'thank' the most times?
In [16]:
print("The index of this speech is", X_df['thank'].idxmax())
If I'm searching for china and trade, what are the top 3 speeches to read according to the CountVectorizer?
In [17]:
china_trade = X_df.sort_values(by=['china', 'trade'], ascending=[False, False])[['china', 'trade']].head(3)
print("These three speeches have the indexes ", *list(china_trade.index))
china_trade
Out[17]:
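One thing to keep in mind about that sort: sort_values(by=['china', 'trade']) orders by the 'china' count first and only uses 'trade' to break ties. If you'd rather find speeches that mention both terms a lot, a rough alternative (just a sketch) is to rank by the combined count:
# Rank speeches by the total number of 'china' + 'trade' mentions
combined = (X_df['china'] + X_df['trade']).sort_values(ascending=False)
combined.head(3)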
Now what if I'm using a TfidfVectorizer?
In [18]:
def simple_tokenizer(str_input):
    words = re.sub(r"[^A-Za-z0-9\-]", " ", str_input).lower().split()
    return words
tfidf_vectorizer = TfidfVectorizer(stop_words='english', tokenizer=simple_tokenizer, use_idf=False, norm='l1')
X = tfidf_vectorizer.fit_transform(speeches_df['content'])
TF_pd = pd.DataFrame(X.toarray(), columns=tfidf_vectorizer.get_feature_names())
china_trade = TF_pd.sort_values(by=['china', 'trade'], ascending=[False, False])[['china', 'trade']].head(3)
print("The three top speeches have the indexes ", *list(china_trade.index))
china_trade
Out[18]:
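A quick note on those settings: use_idf=False switches off the inverse-document-frequency part, and norm='l1' divides each row by its total, so every speech's scores add up to 1. A score of 0.01 for 'china' then reads as "about 1% of this speech's (non-stopword) words are china", no matter how long the speech is. You can sanity-check the row sums if you're curious (a sketch):
# With norm='l1', every row of the term-frequency matrix sums to (about) 1
TF_pd.sum(axis=1).head()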
What's the content of the speeches? Here's a way to get them:
In [19]:
# index 0 is the first speech, which was the first one imported.
paths[0]
Out[19]:
In [31]:
# Pass that into 'cat' using { } which lets you put variables in shell commands
# that way you can pass the path to cat
!cat {paths[0]}
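If you'd rather stay in Python instead of shelling out, the same thing (more or less) looks like this:
# Read the same file with plain Python instead of the shell
with open(paths[0]) as speech_file:
    print(speech_file.read()[:500])  # just the first 500 characters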
Now search for something else! Another two terms that might show up. elections and chaos? Whatever you think might be interesting.
In [35]:
# Collect the vocabulary words that don't start with a digit,
# just to skim the list for interesting search terms
digits = list(map(str, range(0, 10)))
words_list = [word for word in TF_pd.columns if word[0] not in digits]
print(*words_list[5:100], sep='|')  # to get some ideas
In [22]:
chaos = TF_pd.sort_values(by=['awfully', 'bacterial'], ascending=[False, False])[['awfully', 'bacterial']].head(3)
print("The three top speeches have the indexes ", *list(chaos.index))
chaos
Out[22]:
In [23]:
gun_bomb = TF_pd.sort_values(by=['gun', 'bomb'], ascending=[False, False])[['gun', 'bomb']].head(3)
print("The three top speeches have the indexes ", *list(gun_bomb.index))
gun_bomb
Out[23]:
Using a simple counting vectorizer, cluster the documents into eight categories, telling me what the top terms are per category.
Using a term frequency vectorizer, cluster the documents into eight categories, telling me what the top terms are per category.
Using a term frequency inverse document frequency vectorizer, cluster the documents into eight categories, telling me what the top terms are per category.
CountVectorizer(): converts a collection of text documents to a matrix of token counts.
TfidfVectorizer(use_idf=False): converts a collection of raw documents to a matrix of (normalized) term-frequency features; like CountVectorizer followed by TfidfTransformer, but with the inverse-document-frequency part switched off.
TfidfVectorizer(use_idf=True) (the default): additionally enables inverse-document-frequency reweighting.
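To see the difference between the three on something tiny, here's a sketch with a made-up two-sentence corpus (toy data, not part of the homework):
# A toy corpus: raw counts vs. term frequencies vs. tf-idf weights
tiny = ["the cat sat on the mat", "the cat ate the cat food"]
for vec in [CountVectorizer(),                          # token counts
            TfidfVectorizer(use_idf=False, norm='l1'),  # term frequencies
            TfidfVectorizer(use_idf=True)]:             # tf-idf reweighting
    tiny_X = vec.fit_transform(tiny)
    print(pd.DataFrame(tiny_X.toarray(), columns=vec.get_feature_names()))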
In [24]:
countingVectorizer = CountVectorizer(tokenizer=simple_tokenizer, stop_words='english')
TF_Vectorizer = TfidfVectorizer(use_idf=False, tokenizer=simple_tokenizer, stop_words='english')
TF_IDF_Vectorizer = TfidfVectorizer(use_idf=True, tokenizer=simple_tokenizer, stop_words='english')
Vectorizer_list = [countingVectorizer, TF_Vectorizer, TF_IDF_Vectorizer]
# leading '' so the names line up with `count`, which starts at 1 below
Vectorizer_names = ['', 'simple counting vectorizer', 'term frequency vectorizer', 'term frequency IDF vectorizer']
In [25]:
count = 1
for vectorizer in Vectorizer_list:
    print("\n[" + str(count) + "]", Vectorizer_names[count])
    X = vectorizer.fit_transform(speeches_df['content'])
    number_of_clusters = 8
    km = KMeans(n_clusters=number_of_clusters)
    km.fit(X)
    # sort each cluster centre's term weights, biggest first
    order_centroids = km.cluster_centers_.argsort()[:, ::-1]
    terms = vectorizer.get_feature_names()
    for i in range(number_of_clusters):
        top_ten_words = [terms[ind] for ind in order_centroids[i, :10]]
        print("Cluster {}: {}".format(i, ' '.join(top_ten_words)))
    count += 1
Which one do you think works the best?
The TF-IDF vectorizer definitely works the best, at least in this case!
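One way to put a rough number on that is scikit-learn's silhouette_score (higher is better). This is just a sketch, and the numbers will wobble from run to run because k-means starts from random centroids:
from sklearn.metrics import silhouette_score
# Compare how well-separated the 8 clusters are for each vectorizer
for name, vectorizer in zip(Vectorizer_names[1:], Vectorizer_list):
    X = vectorizer.fit_transform(speeches_df['content'])
    labels = KMeans(n_clusters=8).fit_predict(X)
    print(name, round(silhouette_score(X, labels), 3))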
I have a scraped collection of Harry Potter fanfiction at https://github.com/ledeprogram/courses/raw/master/algorithms/data/hp.zip.
I want you to read them in, vectorize them and cluster them. Use this process to find out the two types of Harry Potter fanfiction. What is your hypothesis?
curl -LO
In [26]:
!curl -LO https://github.com/ledeprogram/courses/raw/master/algorithms/data/hp.zip
In [27]:
#!unzip hp.zip
paths_potter = glob.glob('hp/*')
paths_potter[:3]
Out[27]:
In [28]:
potter_texts = []
for path in paths_potter:
    with open(path) as text_file:
        text = {
            'pathname': path,
            'filename': path.split('/')[-1],
            'content': text_file.read()
        }
    potter_texts.append(text)
potter_df = pd.DataFrame(potter_texts)
potter_df.head(2)
Out[28]:
In [29]:
#1
vectorizer = TfidfVectorizer(use_idf=True, tokenizer=simple_tokenizer, stop_words='english')
X = vectorizer.fit_transform(potter_df['content'])
#2
number_of_clusters = 2
km = KMeans(n_clusters=number_of_clusters)
km.fit(X)
#3
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()
for i in range(number_of_clusters):
    top_five_words = [terms[ind] for ind in order_centroids[i, :5]]
    print("Cluster {}: {}".format(i, ' '.join(top_five_words)))
#4
results = pd.DataFrame()
results['text'] = potter_df['content']
results['category'] = km.labels_
results.head(10)
Out[29]:
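To start forming a hypothesis about what the two types actually are, it helps to see how big each cluster is and to skim a few filenames from each (a sketch):
# How many fics fell into each cluster, plus a few example filenames from each
results['filename'] = potter_df['filename']
print(results['category'].value_counts())
for label in sorted(results['category'].unique()):
    print(label, list(results.loc[results['category'] == label, 'filename'].head(3)))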