In [ ]:
import pickle,string,time,enum

In [ ]:
import numpy,requests,scipy.sparse,regex
import matplotlib.pyplot as plt

Data In

Reading in the pickled Tweepy search output and collecting the records into a list.


In [ ]:
with open("search_output-2016-10-16.bin", 'rb') as tweet_file:
    # The file holds repeated pickle dumps: a header row followed by one record per tweet,
    # so keep loading until the end of the file is reached.
    results = []
    while True:
        try:
            results.append(pickle.load(tweet_file))
        except EOFError:
            break
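
A quick look at what was loaded - the first record is the header row, with one tuple per tweet after it (inspection only):

In [ ]:
# Inspection only: count of tweet records, plus the header row and the first tweet.
print(len(results) - 1, "tweet records")
print(results[0])
print(results[1])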

Word Frequency

We need to create the most important dimension - the word counts - in three steps:

  1. Scrubbing - removing common words that will almost never be interesting, as well as punctuation and web links
  2. Reducing - removing those words which always occur with other ones
  3. Sorting - selecting the top 100 words, which will be used to create the word count dictionary.

In [ ]:
headers = results[0]
word_results = [result[1] for result in results[1:]]

Scrubbing

Removing common words, links and mentions (words beginning with @).

Selecting alphanumeric words.


In [ ]:
exclusion_word_list = "|".join(map(lambda x:"(%ss?)"%x,
                                   {"today","what","where","when","why","about","person","people","man","men","women","link","image","video","pic","picture","gif", #nouns
                                    "she","i","he","it","their","that","you","we","us","your","our","their","this","his","her","my","mine","who","whom", #pronouns
                                    "as","from","here","on","in","by","after","up","with","will","at","out","over","before","into", #prepositions
                                    "could","should","would","are","have","do","has","does","been","be","can","may", #auxilary verbs
                                    "is","was","will","are","be","to",
                                    "and","as","than","also","but","how", #conjunctions
                                    "an","the","a", #articles
                                    "bye","if","for","of",
                                    "now",#adverbs
                                    "says","say","take","get","read","watch","like", #verbs
                                    "true","new","not","no","more","all","good","better","best","bad","worse","worst","just","more","most","less","least","first","last", #adjectives
                                   })) 

exclusion_patterns = [r'@\S+', r'http', r'[^a-zA-Z\d\s:]'] #Patterns #,'^\W*$'
exclusion_patterns += [r'^(%s)\b' % exclusion_word_list] # beginning of strings
exclusion_patterns += [r'\b(%s)$' % exclusion_word_list] # end of strings
exclusion_patterns += [r'\b(%s)\b' % exclusion_word_list] # mid-strings

selection_patterns = [r'\b(\w+)\b'] #only selecting words
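
A quick illustrative check of the patterns on a few invented sample tokens (these strings are made up for demonstration and are not from the data):

In [ ]:
# Illustrative only: the sample tokens below are invented, not taken from the tweets.
sample_words = ["@someone", "http://t.co/xyz", "The", "debate", "election2016"]
for w in sample_words:
    excluded = any(regex.search(rx, w.lower()) for rx in exclusion_patterns)
    print(w, "-> excluded" if excluded else "-> kept")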

In [ ]:
word_dict = {}
result_bags = [] #sets of words in the same tweet - this is used below when looking for co-occurring words

for i,r in enumerate(word_results):
    temp_words = r.split()
    result_bag = set()
    
    for w in temp_words:
        temp_word = w.lower()
        
        # skip empty tokens and anything hitting an exclusion pattern
        if temp_word == "" or any(regex.search(rx, temp_word) for rx in exclusion_patterns):
            continue
        
        # keep only the word characters, stripping any surrounding punctuation
        for rx in selection_patterns:
            match = regex.search(rx, temp_word)
            if match: temp_word = match.groups()[0]
        
        if temp_word not in word_dict: word_dict[temp_word] = 0
        word_dict[temp_word] += 1
        result_bag |= {temp_word} # union (not symmetric difference), so repeated words stay in the bag
        
    if(result_bag): result_bags += [result_bag]
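
As a sanity check, the ten most frequent words left after scrubbing can be listed (inspection only, not part of the pipeline):

In [ ]:
# Inspection only: the ten most frequent words after scrubbing.
for w, count in sorted(word_dict.items(), key=lambda kv: kv[1], reverse=True)[:10]:
    print(count, w)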

Reducing

Reducing in this context means removing any word that is a substring of another word: the substring's own entry is removed, and its count is added to the superstring's entry.
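
A minimal sketch of that substring merge, assuming word_dict maps word to count as built above (the cells below take a co-occurrence-based route instead):

In [ ]:
# Hypothetical substring merge, as described above. For each word, if it is a
# substring of some longer word, fold its count into that word's entry.
# Assumes word_dict maps word -> count; quadratic, but fine for a few thousand words.
merged_dict = dict(word_dict)
for short in sorted(word_dict, key=len):
    supers = [w for w in merged_dict if short != w and short in w]
    if supers:
        merged_dict[supers[0]] += merged_dict.pop(short)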


In [ ]:
# definitive ordering of word list
word_order_dict = {}
word_list = numpy.array(sorted(word_dict.keys()))
for i,word in enumerate(word_list): word_order_dict[word] = i

# creating word matrix
word_matrix = numpy.zeros((len(word_dict),len(word_dict)),dtype=numpy.uint)

# counting the number of times each word occurs with each other
for n,rb in enumerate(result_bags):
    #if(100<n<110): print(rb)
    for word in rb:
        i = word_order_dict[word]
        for word2 in rb:
            j = word_order_dict[word2]
            if(i != j): word_matrix[i][j] += 1
                
#making the co-occurrence matrix sparse (optional)
#word_matrix = scipy.sparse.dok_matrix(word_matrix)
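
The strongest co-occurrence partners of any single word can be read off its row in the matrix (illustrative cell; the query word is hypothetical and may not be present in this dataset):

In [ ]:
# Illustrative lookup: top co-occurrence partners for one word.
# "debate" is a hypothetical query and may not be in word_order_dict.
query = "debate"
if query in word_order_dict:
    row = word_matrix[word_order_dict[query]]
    for j in numpy.argsort(row)[::-1][:5]:
        if row[j] > 0: print(word_list[j], row[j])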

Grouping those which almost always occur together


In [ ]:
#Selecting those words which have co-occurrences above a threshold
THRESHOLD = 10
maxes_over_threshold = numpy.array(numpy.max(word_matrix,axis=1)>THRESHOLD).reshape(-1) #word_matrix.todense()

COOCCURRENCE_FREQ_THRESHOLD = 0.75
for word in word_list[maxes_over_threshold]:
    row = word_matrix[word_order_dict[word]] #.todense()
    
    if(word in word_dict):
        maxes = numpy.array(row).reshape(-1) == row.max()
        partner_words = word_list[maxes]
        co_occurrence_freq = row.max()/word_dict[word]
        
        if(co_occurrence_freq > COOCCURRENCE_FREQ_THRESHOLD): 
            #for w in [word]+partner_words.tolist(): print(w,word_dict[w])
            print("Removing {} in lieu of {}".format(word,partner_words))
            del word_dict[word]

Selecting


In [ ]:
word_array = numpy.array([(word_order_dict[w],word_dict[w]) for w in word_dict])
word_array = word_array[numpy.argsort(word_array[:,1])]

In [ ]:
WORD_FIELDS = 100
word_fields = set()
for w in word_array[-WORD_FIELDS:]:
    word = word_list[w[0]]
    word_fields |= {word}

In [ ]:
print(word_fields)

In [ ]:
print(headers)

In [ ]:
clients = set()
languages = set()
for result in results[1:]:
    clients |= {result[-1]}
    languages |= {result[-2]}
    
Languages = enum.Enum("Language",list(languages))
Clients = enum.Enum("Client",list(clients))
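
A quick hedged look at the generated enums (the 'en' lookup is illustrative and only runs if that language is actually present in the data):

In [ ]:
# Illustrative use of the generated enums; 'en' may or may not be in this dump.
print(list(Languages)[:5])
if 'en' in languages: print(Languages['en'].name, Languages['en'].value)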

In [ ]:
results_list = []
for result in results[1:]:
    
    result_row = [
        result[2], #Lat
        result[3], #Long
        result[4], #Time
        result[5], #Favourites
        result[6], #Retweets
        Languages[result[7]],
        Clients[result[8]]
                 ]
    
    word_field_result = [word in result[1] for word in sorted(word_fields)] # sorted, so the column order is stable
    if any(word_field_result): 
        result_row += word_field_result
        results_list += [result_row]
        
result_array = numpy.array(results_list)
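
Because the rows mix floats, enum members and booleans, numpy stores the array with an object dtype; a quick check:

In [ ]:
# The mixed column types (floats, enum members, booleans) give an object-dtype array.
print(result_array.shape, result_array.dtype)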

Relevance Scoring

Using Google News to give a relevance score for each tweet. Google does not seem to like this.


In [ ]:
text_pattern = r'id="resultStats">(.*)results'
text_regex = regex.compile(text_pattern)

number_pattern = r"(\d{1,4}(,\d{3})*)(\.\d+)?"
number_regex = regex.compile(number_pattern)

In [ ]:
for word in word_fields:
    time.sleep(1)
    
    search_url = ['https://www.google.com/search?q=', word,
                  '&safe=off&hl=en&gl=za&source=lnt',
                  '&tbs=cdr%3A1',
                  '%2Ccd_min%3A2016%2F10%2F07',
                  '%2Ccd_max%3A2016%2F10%2F07',
                  '&tbm=nws']
    result = requests.get("".join(search_url))
    
    result_text = text_regex.search(result.text).groups()[0]
    number_results = float(number_regex.search(result_text).groups()[0].replace(",",""))
    print(word, number_results)

Using SKLearn


In [ ]:
import sklearn.feature_extraction.text

In [ ]:
count_vect = sklearn.feature_extraction.text.CountVectorizer()

In [ ]:
bag_of_words = count_vect.fit_transform(word_results)
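
A quick look at what the vectoriser produced - the document-term matrix shape and a few vocabulary entries:

In [ ]:
# Inspection of the sklearn bag of words: matrix shape and a sample of the vocabulary.
print(bag_of_words.shape)
print(sorted(count_vect.vocabulary_)[:10])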
