import pickle,string,time,enum
import re

import numpy,requests,scipy.sparse,regex
import matplotlib.pyplot as plt
import matplotlib.pyplot as plt

Data In

Reading in the pickled Tweepy search results and collecting them into a list.

# Load every pickled record from the search dump into `results`.
# The file is read as a sequence of back-to-back pickles, so keep loading
# until pickle signals end-of-file.
# NOTE: unpickling can execute arbitrary code - only load dumps that come
# from a trusted source.
with open("search_output-2016-10-16.bin",'rb') as tweet_file:
    results = []
    while True:
        try:
            results.append(pickle.load(tweet_file))
        except EOFError:
            # No more records; the `with` statement closes the file.
            break

Word Frequency

We need to create the most important dimension - the word count.

  1. Scrubbing - remove common words that will almost never be interesting, punctuation and web links
  2. Reducing - removing those words which always occur with other ones
  3. Sorting - selecting the top 100 words which will be used to create the word count dictionary.

# The first record is kept as the header row; field 1 of every later
# record is collected as the text used for word counting.
headers = results[0]
word_results = []
for record in results[1:]:
    word_results.append(record[1])


Removing common words, links and mentions (words beginning @).

Selecting alpha-numeric words.

# Common words that are excluded from the word count.
# NOTE(review): the original literal was cut off after the adjectives line,
# so further word categories may have been lost - confirm against the
# original notebook.
_exclusion_words = {
    "today","what","where","when","why","about","person","people","man","men","women","link","image","video","pic","picture","gif", #nouns
    "she","i","he","it","their","that","you","we","us","your","our","their","this","his","her","my","mine","who","whom", #pronouns
    "as","from","here","on","in","by","after","up","with","will","at","out","over","before","into", #prepositions
    "could","should","would","are","have","do","has","does","been","be","can","may", #auxiliary verbs
    "and","as","than","also","but","how", #conjunctions
    "an","the","a", #articles
    "says","say","take","get","read","watch","like", #verbs
    "true","new","not","no","more","all","good","better","best","bad","worse","worst","just","more","most","less","least","first","last", #adjectives
}

# Each word becomes the alternative "(words?)" so that a trailing "s" is
# matched as well; the alternatives are joined into one regex alternation.
# Sorted so the generated pattern is identical from run to run (iteration
# order of a set of strings is not stable between interpreter sessions).
exclusion_word_list = "|".join("(%ss?)" % word for word in sorted(_exclusion_words))

# Patterns that disqualify a token from the word count. Raw strings are used
# so that regex escapes such as \S and \d are not treated as (invalid)
# Python string escapes.
exclusion_patterns = [r'@\S+', 'http', r'[^a-zA-Z\d\s:]'] # mentions, links, non alpha-numeric characters
exclusion_patterns += [r'^(%s)\b' % exclusion_word_list] # excluded word at the beginning of the string
exclusion_patterns += [r'\b(%s)$' % exclusion_word_list] # excluded word at the end of the string
exclusion_patterns += [r'\b(%s)\b' % exclusion_word_list] # excluded word mid-string

selection_patterns = [r'\b(\w+)\b'] # only selecting words

# Count how often each retained word occurs, and remember which words
# appear together in the same tweet.
# NOTE(review): the two regex calls below were garbled in the source and
# have been reconstructed as `search` calls - confirm against the original.

# Compile once: the same patterns are applied to every word of every tweet.
_exclusion_regexes = [re.compile(rx) for rx in exclusion_patterns]
_selection_regexes = [re.compile(rx) for rx in selection_patterns]

word_dict = {} # word -> number of times it was counted across all tweets
result_bags = [] # sets of words in the same tweet - this is used below when looking for co-occurring words

for r in word_results:
    result_bag = set()
    for w in r.split():
        temp_word = w.lower()
        # Skip tokens that hit any exclusion pattern.
        if temp_word == "" or any(rx.search(temp_word) for rx in _exclusion_regexes):
            continue
        # Reduce the token to its word part when a selection pattern matches.
        match = None
        for rx in _selection_regexes:
            match = rx.search(temp_word)
        if match:
            temp_word = match.groups()[0]
        word_dict[temp_word] = word_dict.get(temp_word, 0) + 1
        # add(), not ^=: symmetric difference would drop a word from the bag
        # again whenever it occurs an even number of times in one tweet.
        result_bag.add(temp_word)
    if result_bag:
        result_bags.append(result_bag)


Reducing, in this context, means removing any word that is a substring of another word: the substring's own entry is removed and its count is added to the superstring's entry.

# Fix one alphabetical ordering of the counted words and map every word
# to its position in that ordering.
word_list = numpy.array(sorted(word_dict.keys()))
word_order_dict = {term: position for position, term in enumerate(word_list)}

# Square matrix of co-occurrence counts, indexed by word position.
vocabulary_size = len(word_dict)
word_matrix = numpy.zeros((vocabulary_size,vocabulary_size),dtype=numpy.uint)

# For every tweet, add one to the cell of each ordered pair of distinct
# words that appear in it.
for bag in result_bags:
    positions = [word_order_dict[term] for term in bag]
    for row_index in positions:
        for column_index in positions:
            if row_index == column_index:
                continue
            word_matrix[row_index][column_index] += 1
# making co-occurrence sparse
#word_matrix = scipy.sparse.dok_matrix(word_matrix)

Grouping those which almost always occur together

# NOTE(review): THRESHOLD and COOCCURANCE_FREQ_THRESHOLD are not defined in
# the visible part of this file - confirm where they are set.

# Boolean mask of the words whose largest co-occurrence count exceeds THRESHOLD.
row_maxima = numpy.max(word_matrix,axis=1)
maxes_over_threshold = numpy.array(row_maxima>THRESHOLD).reshape(-1) #word_matrix.todense()

for word in word_list[maxes_over_threshold]:
    row = word_matrix[word_order_dict[word]] #.todense()
    peak = row.max()
    # The partner words are those sharing the highest co-occurrence count.
    maxes = numpy.array(row).reshape(-1) == peak
    partner_words = word_list[maxes]
    if word not in word_dict:
        continue
    # Share of this word's occurrences that coincide with its top partner.
    co_occurance_freq = peak/word_dict[word]
    if co_occurance_freq > COOCCURANCE_FREQ_THRESHOLD:
        print("Removing {} in lieu of {}".format(word,partner_words))
        del word_dict[word]


# Pair each remaining word's position with its count, order the pairs by
# ascending count, and keep the words of the last 100 rows.
index_count_pairs = [(word_order_dict[w],word_dict[w]) for w in word_dict]
word_array = numpy.array(index_count_pairs)
ascending_by_count = numpy.argsort(word_array[:,1])
word_array = word_array[ascending_by_count]

word_fields = {word_list[pair[0]] for pair in word_array[-100:]}

# Collect the distinct values of the last two fields of every record
# (skipping the header row) and build an enumeration from each set.
tweet_records = results[1:]
clients = {record[-1] for record in tweet_records}
languages = {record[-2] for record in tweet_records}

Languages = enum.Enum("Language",list(languages))
Clients = enum.Enum("Client",list(clients))

# Build one row per tweet: five metadata fields followed by one boolean
# per vocabulary word saying whether that word occurs in the tweet text.
# Tweets containing none of the vocabulary words are dropped.
# NOTE(review): the original `result_row` literal was cut off after the
# retweets field, so further columns may have been lost - confirm.

# Fixed column order, so the word columns are reproducible between runs.
word_field_order = sorted(word_fields)

results_list = []
for result in results[1:]:
    result_row = [
        result[2], #Lat
        result[3], #Long
        result[4], #Time
        result[5], #Favourites
        result[6], #Retweets
    ]
    # The vocabulary was lower-cased when it was built, so compare against
    # lower-cased text; otherwise capitalised occurrences are missed.
    tweet_text = result[1].lower()
    word_field_result = [word in tweet_text for word in word_field_order]
    if any(word_field_result):
        result_row += word_field_result
        results_list.append(result_row)
result_array = numpy.array(results_list)

Relevance Scoring

Using Google News to give a relevance score for each Tweet. Google does not seem to like this.

# Captures the text between the `id="resultStats">` marker and the word
# "results" in a fetched page.
text_pattern = r'id="resultStats">(.*)results'
text_regex = re.compile(text_pattern)

# Matches a number with optional thousands separators and an optional
# decimal part; group 1 is the integer part including its commas.
# Raw strings keep \d and \. from being read as Python string escapes.
number_pattern = r"(\d{1,4}(,\d{3})*)(\.\d+)?"
number_regex = re.compile(number_pattern)

# For each of the last 100 entries of word_list, build a search URL, fetch
# it with requests, and turn a value taken from the response into a float.
# NOTE(review): this loop is incomplete as it stands - the `search_url` list
# is never closed (its URL pieces appear to be missing) and the right-hand
# sides of `result_text` and `number_results` have lost the expressions they
# index into. Presumably they applied text_regex / number_regex to the
# response; restore them from the original notebook before running.
for word in word_list[-100:]:
    search_url = ['',word[0],
    result = requests.get("".join(search_url))
    result_text =[0]
    # Thousands separators are stripped before the float conversion.
    number_results = float([0].replace(",",""))

Using SKLearn

import sklearn.feature_extraction.text

# Build a bag-of-words count matrix from the tweet texts.
count_vect = sklearn.feature_extraction.text.CountVectorizer()

# fit_transform learns the vocabulary and counts in one step; calling
# transform() on an unfitted vectorizer raises NotFittedError. The
# vectorizer expects an iterable of strings, so it is given the tweet texts
# (word_results) rather than the raw records, which also include the
# header row and non-text fields.
bag_of_words = count_vect.fit_transform(word_results)

