In [1]:
%run helper_functions.py
%run df_functions.py
import string
import nltk
import spacy
nlp = spacy.load('en')
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter
from sklearn.cluster import KMeans

So far, we have two databases:

  1. 2nd degree connection database where all handles have valid LDA Analysis.

  2. A database with my tweets and associated LDA analysis.

The LDA method was quite powerful for potential followers, distilling down their entire corpus to a few key terms.

Let's now do some TF-IDF and KMeans clustering to see if we find similar results to LDA.

In fact, later in the notebook, I will take the intersection of the LDA Analysis results and TF-IDF results. This intersection will represent words/topics that were picked up by BOTH models for a particular handle's tweets. This will give us the most robust results!


In [2]:
gabr_tweets = unpickle_object("gabr_ibrahim_tweets_LDA_Complete.pkl")

In [3]:
gabr_tweets[0]['gabr_ibrahim'].keys() #just to refresh our mind of the keys in the sub-dictionary


Out[3]:
dict_keys(['retweet_count', 'hashtags', 'favorite_count', 'tokenized_tweets', 'content', 'LDA'])

I will now create a TF-IDF model for my tweets.

Using K-Means Clustering with TF-IDF, I will cluster my tweet's into 20 centroids. From each of these centroids, I will extract 20 words. These words will be placed in a counter dictionary.

TF-IDF KMeans - segemented by individual tweet!

I will make use of spacy again in order to ensure we are giving the 'purest' for of our tweets to the tf-idf vectorizer.

You will see two lists below relating to vocabulary. I will use these lists later to create a usefull dictionary that will help identify particular words within a centroid by index!


In [4]:
temp_gabr_df = pd.DataFrame.from_dict(gabr_tweets[0], orient="index")

In [5]:
temp_gabr_df = filtration(temp_gabr_df, "content")

In [6]:
gabr_tweets_filtered_1 = dataframe_to_dict(temp_gabr_df)

In [7]:
clean_tweet_list = []
totalvocab_tokenized = []
totalvocab_stemmed = []


for tweet in gabr_tweets_filtered_1[0]['gabr_ibrahim']['content']:
    clean_tweet = ""
    to_process = nlp(tweet)
    
    for token in to_process:
        if token.is_space:
            continue
        elif token.is_punct:
            continue
        elif token.is_stop:
            continue
        elif token.is_digit:
            continue
        elif len(token) == 1:
            continue
        elif len(token) == 2:
            continue
        else:
            clean_tweet += str(token.lemma_) + ' '
            totalvocab_tokenized.append(str(token.lemma_))
            totalvocab_stemmed.append(str(token.lemma_))
            
    clean_tweet_list.append(clean_tweet)

In [8]:
#just going to add this to the dictionary so we can do the second round of filtration
gabr_tweets_filtered_1[0]['gabr_ibrahim']['temp_tfidf'] = clean_tweet_list

In [9]:
temp_gabr_df = pd.DataFrame.from_dict(gabr_tweets_filtered_1[0], orient='index')

In [10]:
temp_gabr_df = filtration(temp_gabr_df, 'temp_tfidf')

In [11]:
gabr_tweets_filtered_2 = dataframe_to_dict(temp_gabr_df)

In [12]:
clean_tweet_list = gabr_tweets_filtered_2[0]['gabr_ibrahim']["temp_tfidf"]
del gabr_tweets_filtered_2[0]["gabr_ibrahim"]["temp_tfidf"] # we will add back TF-IDF analysis later!

In [13]:
vocab_frame = pd.DataFrame({'words': totalvocab_tokenized}, index = totalvocab_stemmed)
print('There are ' + str(vocab_frame.shape[0]) + ' items in vocab_frame')


There are 12065 items in vocab_frame

In [14]:
#define vectorizer parameters
tfidf_vectorizer = TfidfVectorizer(max_features=200000, stop_words='english', ngram_range=(0,2))

tfidf_matrix = tfidf_vectorizer.fit_transform(clean_tweet_list) #fit the vectorizer to synopses

print(tfidf_matrix.shape)


(1790, 12785)

In [15]:
terms = tfidf_vectorizer.get_feature_names()

In [16]:
num_clusters = 20

km = KMeans(n_clusters=num_clusters, n_jobs=-1, random_state=200)

km.fit(tfidf_matrix)

clusters = km.labels_.tolist()

order_centroids = km.cluster_centers_.argsort()[:, ::-1]

In [17]:
cluster_dict = dict()
for i in range(num_clusters):
    for ind in order_centroids[i, :20]: #replace 6 with n words per cluster
        word = str(vocab_frame.ix[terms[ind].split(' ')].values.tolist()[0][0])
        if i not in cluster_dict:
            cluster_dict[i] = [word]
        else:
            cluster_dict[i].append(word)

In [18]:
cluster_dict.keys() #here we see all 20 clusters.


Out[18]:
dict_keys([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19])

In [19]:
cluster_dict[0] #words in cluster 1


Out[19]:
['nan',
 'turn',
 'terrorism',
 'identify',
 'stable',
 'stable',
 'secure',
 'briton',
 'blah',
 'battle',
 'mi5',
 'poll',
 'trump',
 'swamp',
 'turn',
 'islamic',
 'likely',
 'city',
 'presently',
 'identify']

In [20]:
cluster_dict[1] #words in cluster 2


Out[20]:
['nan',
 'report',
 'death',
 'police',
 'follow',
 'blast',
 'toll',
 'death',
 'rise',
 'shoot',
 'turkey',
 'explosion',
 'ankara',
 'war',
 'syria',
 'attack',
 'airport',
 'strike',
 'turkish',
 'syria']

In [21]:
cluster_dict[2] #words in cluster 3


Out[21]:
['nan',
 'vote',
 'yes',
 'scotland',
 'turnout',
 'yes',
 'scotland',
 'vote',
 'vote',
 'remain',
 'remain',
 'vote',
 'result',
 'east',
 'ayrshire',
 'ayrshire',
 'scotland',
 'total',
 'declaration',
 'declaration']

In [22]:
#Now lets make our tfidf Counter!
cluster_values = []

for k, v in cluster_dict.items():
    cluster_values.extend(v)

counter_gabr_tfidf = Counter(cluster_values)

In [23]:
counter_gabr_tfidf


Out[23]:
Counter({'abc': 2,
         'accept': 1,
         'access': 2,
         'accord': 1,
         'aid': 1,
         'airport': 1,
         'allen': 2,
         'amazing': 2,
         'ambush': 1,
         'ankara': 2,
         'announce': 1,
         'archive': 1,
         'area': 1,
         'argue': 1,
         'ashamed': 2,
         'assistance': 2,
         'atrocity': 2,
         'attack': 4,
         'attempt': 1,
         'attorney': 1,
         'ayrshire': 2,
         'battle': 1,
         'bbc': 3,
         'big': 2,
         'blah': 1,
         'blast': 1,
         'blog': 3,
         'boil': 2,
         'bomb': 1,
         'break': 4,
         'brilliant': 2,
         'briton': 1,
         'broxbourne': 1,
         'capital': 1,
         'change': 1,
         'check': 2,
         'chemical': 1,
         'child': 1,
         'city': 1,
         'clinton': 2,
         'collaboration': 1,
         'complete': 1,
         'conda': 1,
         'conley': 1,
         'control': 1,
         'council': 1,
         'counterinsurgency': 1,
         'country': 1,
         'coup': 2,
         'crowd': 1,
         'cut': 1,
         'cyber': 2,
         'datum': 4,
         'day': 2,
         'death': 2,
         'declaration': 2,
         'deep': 2,
         'defend': 2,
         'definition': 1,
         'die': 1,
         'digital': 2,
         'donald': 2,
         'east': 1,
         'egypt': 1,
         'erdogan': 1,
         'everybody': 2,
         'evidence': 1,
         'expert': 1,
         'explosion': 1,
         'expose': 1,
         'eye': 3,
         'facilitator': 1,
         'fact': 2,
         'follow': 1,
         'force': 1,
         'foreign': 1,
         'forever': 1,
         'forward': 1,
         'frog': 1,
         'fuck': 1,
         'general': 3,
         'gift': 1,
         'good': 1,
         'government': 3,
         'great': 1,
         'guess': 1,
         'hacker': 1,
         'head': 1,
         'helicopter': 1,
         'history': 1,
         'house': 1,
         'humanitarian': 2,
         'identify': 2,
         'immigration': 1,
         'incredible': 1,
         'internet': 2,
         'introduction': 2,
         'iranian': 2,
         'isis': 1,
         'islamic': 3,
         'issue': 1,
         'john': 2,
         'kill': 3,
         'know': 5,
         'late': 1,
         'later': 2,
         'law': 1,
         'lawful': 2,
         'learn': 1,
         'learning': 1,
         'leave': 4,
         'lecture': 1,
         'lepage': 1,
         'let': 1,
         'like': 5,
         'likely': 1,
         'live': 1,
         'look': 4,
         'love': 2,
         'machine': 2,
         'mass': 2,
         'mcgill': 2,
         'mean': 1,
         'meet': 1,
         'mi5': 1,
         'military': 1,
         'mom': 2,
         'money': 1,
         'nan': 20,
         'new': 4,
         'news': 3,
         'nicola': 2,
         'nigerian': 1,
         'obama': 1,
         'office': 1,
         'official': 1,
         'online': 2,
         'open': 2,
         'order': 1,
         'panel': 2,
         'paralysis': 2,
         'pay': 1,
         'peel': 1,
         'people': 4,
         'personal': 2,
         'police': 2,
         'political': 1,
         'poll': 2,
         'possibly': 2,
         'post': 1,
         'prepare': 1,
         'presently': 1,
         'president': 1,
         'prevention': 2,
         'problem': 1,
         'program': 1,
         'project': 3,
         'putin': 1,
         'reaction': 1,
         'read': 1,
         'record': 2,
         'remain': 2,
         'renee': 1,
         'report': 2,
         'result': 3,
         'right': 1,
         'rise': 1,
         'run': 1,
         'say': 1,
         'science': 1,
         'scientist': 1,
         'scotland': 4,
         'secure': 1,
         'security': 3,
         'share': 1,
         'shit': 1,
         'shoot': 1,
         'silence': 1,
         'simply': 1,
         'snp': 2,
         'social': 1,
         'soldier': 2,
         'special': 2,
         'stable': 2,
         'state': 2,
         'strike': 1,
         'strong': 1,
         'student': 1,
         'sturgeon': 1,
         'sunderland': 1,
         'support': 1,
         'surveillance': 2,
         'swamp': 1,
         'sydney': 1,
         'syria': 5,
         'taliban': 2,
         'talk': 1,
         'tax': 2,
         'team': 1,
         'tell': 4,
         'terrorism': 1,
         'terrorist': 1,
         'thank': 2,
         'thing': 1,
         'time': 2,
         'today': 1,
         'toll': 1,
         'total': 1,
         'tragedy': 2,
         'training': 1,
         'troop': 1,
         'trump': 3,
         'try': 1,
         'tupac': 1,
         'turkey': 2,
         'turkish': 3,
         'turn': 2,
         'turnout': 1,
         'ukraine': 1,
         'ukrainian': 2,
         'united': 2,
         'upcoming': 1,
         'video': 1,
         'view': 2,
         'vote': 6,
         'want': 2,
         'war': 1,
         'warn': 1,
         'way': 1,
         'webcast': 2,
         'westminster': 2,
         'white': 1,
         'work': 3,
         'world': 1,
         'write': 1,
         'yay': 2,
         'yes': 2})

In [24]:
gabr_tweets_filtered_2[0]['gabr_ibrahim']["tfid_counter"] = counter_gabr_tfidf

In [25]:
gabr_tfidf_counter = gabr_tweets_filtered_2[0]['gabr_ibrahim']["tfid_counter"]

gabr_lda_counter = gabr_tweets_filtered_2[0]['gabr_ibrahim']["LDA"]

In [26]:
gabr_tfidf_set = set()
gabr_lda_set = set()

for key, value in gabr_tfidf_counter.items():
    gabr_tfidf_set.add(key)

for key, value in gabr_lda_counter.items():
    gabr_lda_set.add(key)

In [27]:
intersection = gabr_tfidf_set.intersection(gabr_lda_set)

In [28]:
gabr_tweets_filtered_2[0]['gabr_ibrahim']["lda_tfid_intersection"] = intersection

In [29]:
pickle_object(gabr_tweets_filtered_2, "FINAL_GABR_DATABASE_LDA_TFIDF_VERIFIED")

Thats all there is to this process! I will now write a script called kmeans.py that will dyanmically run all the code above for individuals in final_database_LDA_verified.pkl.


In [ ]: