In [1]:
%run helper_functions.py
%run df_functions.py
import string
import nltk
import spacy
import pandas as pd  # pd is used below; the %run scripts may already provide it
nlp = spacy.load('en')
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter
from sklearn.cluster import KMeans
So far, we have two databases:
1. A 2nd-degree connection database in which every handle has a valid LDA analysis.
2. A database containing my tweets and their associated LDA analysis.
The LDA method was quite powerful for potential followers, distilling each handle's entire corpus down to a few key terms.
Let's now run TF-IDF and KMeans clustering to see whether we find results similar to those from LDA.
In fact, later in the notebook, I will take the intersection of the LDA analysis results and the TF-IDF results. This intersection represents the words/topics that were picked up by BOTH models for a particular handle's tweets, which will give us the most robust results!
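As a quick preview of that intersection step, here is a minimal sketch with made-up terms (the real computation, over my actual LDA and TF-IDF counters, appears at the end of this notebook): the intersection is simply a set operation over the keys of the two Counter objects.

from collections import Counter

# made-up counters standing in for one handle's LDA and TF-IDF results
lda_terms = Counter({'python': 4, 'data': 3, 'nyc': 1})
tfidf_terms = Counter({'python': 2, 'data': 5, 'soccer': 1})

shared = set(lda_terms).intersection(set(tfidf_terms))
print(shared)  # words picked up by both models: {'python', 'data'}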
In [2]:
gabr_tweets = unpickle_object("gabr_ibrahim_tweets_LDA_Complete.pkl")
In [3]:
gabr_tweets[0]['gabr_ibrahim'].keys() #just to refresh our memory of the keys in the sub-dictionary
Out[3]:
I will now create a TF-IDF model for my tweets.
Using KMeans clustering on the TF-IDF matrix, I will cluster my tweets into 20 centroids. From each of these centroids, I will extract the top 20 words. These words will be placed in a Counter dictionary.
I will make use of spacy again in order to ensure we are giving the 'purest' form of our tweets to the TF-IDF vectorizer.
You will see two lists below relating to vocabulary. I will use these lists later to build a useful lookup table that helps identify a particular word within a centroid by its index!
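To make that index-to-word lookup concrete before we build it for real, here is a toy illustration with made-up values (the toy_ names are hypothetical and only stand in for the real vocabulary lists and vectorizer terms built below):

import pandas as pd

# toy vocabulary lists standing in for totalvocab_tokenized / totalvocab_stemmed
toy_tokenized = ['tweet', 'cluster', 'python']
toy_stemmed = ['tweet', 'cluster', 'python']
toy_vocab_frame = pd.DataFrame({'words': toy_tokenized}, index=toy_stemmed)

toy_terms = ['cluster', 'python', 'tweet']  # feature names from a fitted vectorizer
ind = 1                                     # an index taken from a sorted centroid
print(toy_vocab_frame.loc[toy_terms[ind].split(' ')].values.tolist()[0][0])  # 'python'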
In [4]:
temp_gabr_df = pd.DataFrame.from_dict(gabr_tweets[0], orient="index")
In [5]:
temp_gabr_df = filtration(temp_gabr_df, "content")
In [6]:
gabr_tweets_filtered_1 = dataframe_to_dict(temp_gabr_df)
In [7]:
clean_tweet_list = []
totalvocab_tokenized = []
totalvocab_stemmed = []
for tweet in gabr_tweets_filtered_1[0]['gabr_ibrahim']['content']:
    clean_tweet = ""
    to_process = nlp(tweet)
    for token in to_process:
        # skip whitespace, punctuation, stop words, digits, and 1-2 character tokens
        if token.is_space or token.is_punct or token.is_stop or token.is_digit:
            continue
        elif len(token) <= 2:
            continue
        else:
            clean_tweet += str(token.lemma_) + ' '
            totalvocab_tokenized.append(str(token.lemma_))
            totalvocab_stemmed.append(str(token.lemma_))
    clean_tweet_list.append(clean_tweet)
In [8]:
#just going to add this to the dictionary so we can do the second round of filtration
gabr_tweets_filtered_1[0]['gabr_ibrahim']['temp_tfidf'] = clean_tweet_list
In [9]:
temp_gabr_df = pd.DataFrame.from_dict(gabr_tweets_filtered_1[0], orient='index')
In [10]:
temp_gabr_df = filtration(temp_gabr_df, 'temp_tfidf')
In [11]:
gabr_tweets_filtered_2 = dataframe_to_dict(temp_gabr_df)
In [12]:
clean_tweet_list = gabr_tweets_filtered_2[0]['gabr_ibrahim']["temp_tfidf"]
del gabr_tweets_filtered_2[0]["gabr_ibrahim"]["temp_tfidf"] # we will add back TF-IDF analysis later!
In [13]:
vocab_frame = pd.DataFrame({'words': totalvocab_tokenized}, index = totalvocab_stemmed)
print('There are ' + str(vocab_frame.shape[0]) + ' items in vocab_frame')
In [14]:
#define vectorizer parameters
tfidf_vectorizer = TfidfVectorizer(max_features=200000, stop_words='english', ngram_range=(1, 2))  # unigrams and bigrams
tfidf_matrix = tfidf_vectorizer.fit_transform(clean_tweet_list) #fit the vectorizer to the cleaned tweets
print(tfidf_matrix.shape)
In [15]:
terms = tfidf_vectorizer.get_feature_names()
In [16]:
num_clusters = 20
km = KMeans(n_clusters=num_clusters, n_jobs=-1, random_state=200)
km.fit(tfidf_matrix)
clusters = km.labels_.tolist()
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
In [17]:
cluster_dict = dict()

for i in range(num_clusters):
    for ind in order_centroids[i, :20]:  # take the top 20 words per cluster
        word = str(vocab_frame.ix[terms[ind].split(' ')].values.tolist()[0][0])
        if i not in cluster_dict:
            cluster_dict[i] = [word]
        else:
            cluster_dict[i].append(word)
In [18]:
cluster_dict.keys() #here we see all 20 clusters.
Out[18]:
In [19]:
cluster_dict[0] #words in cluster 1
Out[19]:
In [20]:
cluster_dict[1] #words in cluster 2
Out[20]:
In [21]:
cluster_dict[2] #words in cluster 3
Out[21]:
In [22]:
#Now let's make our TF-IDF Counter!
cluster_values = []
for k, v in cluster_dict.items():
    cluster_values.extend(v)
counter_gabr_tfidf = Counter(cluster_values)
In [23]:
counter_gabr_tfidf
Out[23]:
In [24]:
gabr_tweets_filtered_2[0]['gabr_ibrahim']["tfid_counter"] = counter_gabr_tfidf
In [25]:
gabr_tfidf_counter = gabr_tweets_filtered_2[0]['gabr_ibrahim']["tfid_counter"]
gabr_lda_counter = gabr_tweets_filtered_2[0]['gabr_ibrahim']["LDA"]
In [26]:
gabr_tfidf_set = set()
gabr_lda_set = set()

for key, value in gabr_tfidf_counter.items():
    gabr_tfidf_set.add(key)

for key, value in gabr_lda_counter.items():
    gabr_lda_set.add(key)
In [27]:
intersection = gabr_tfidf_set.intersection(gabr_lda_set)
In [28]:
gabr_tweets_filtered_2[0]['gabr_ibrahim']["lda_tfid_intersection"] = intersection
In [29]:
pickle_object(gabr_tweets_filtered_2, "FINAL_GABR_DATABASE_LDA_TFIDF_VERIFIED")
That's all there is to this process! I will now write a script called kmeans.py that will dynamically run all of the code above for every individual in final_database_LDA_verified.pkl. A rough outline of that script is sketched below.
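For reference, here is a rough outline of what kmeans.py could look like. This is only a sketch: it assumes the helper functions used above (unpickle_object, pickle_object) are importable from helper_functions.py, it skips the spacy cleaning step for brevity, and the function name tfidf_kmeans and the output pickle name are hypothetical.

# kmeans.py -- a rough outline, not the final script
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

from helper_functions import unpickle_object, pickle_object  # assumed to live here

def tfidf_kmeans(clean_tweets, num_clusters=20, words_per_cluster=20):
    """Cluster one handle's cleaned tweets and return a Counter of centroid words."""
    vectorizer = TfidfVectorizer(max_features=200000, stop_words='english', ngram_range=(1, 2))
    matrix = vectorizer.fit_transform(clean_tweets)
    terms = vectorizer.get_feature_names()
    km = KMeans(n_clusters=num_clusters, n_jobs=-1, random_state=200)
    km.fit(matrix)
    order_centroids = km.cluster_centers_.argsort()[:, ::-1]
    # for simplicity this uses the raw TF-IDF terms rather than the vocab_frame lookup
    words = [terms[ind]
             for i in range(num_clusters)
             for ind in order_centroids[i, :words_per_cluster]]
    return Counter(words)

if __name__ == '__main__':
    database = unpickle_object("final_database_LDA_verified.pkl")
    for entry in database:
        for handle, data in entry.items():
            clean_tweets = data['content']  # would first go through the same spacy cleaning as above
            tfidf_counter = tfidf_kmeans(clean_tweets)
            data['tfid_counter'] = tfidf_counter
            data['lda_tfid_intersection'] = set(tfidf_counter) & set(data['LDA'])
    pickle_object(database, "final_database_LDA_TFIDF_verified")  # hypothetical output name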
In [ ]: