In [ ]:
import pickle,string,time,enum
In [ ]:
import numpy,requests,scipy.sparse,regex
import matplotlib.pyplot as plt
Reading in the Tweepy data and turning it into a dictionary
In [ ]:
with open("search_output-2016-10-16.bin", 'rb') as tweet_file:
    results = []
    while not tweet_file.closed:
        try:
            results += [pickle.load(tweet_file)]
        except EOFError:
            tweet_file.close()
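The loop above ends only because the file is closed inside the except clause. An equivalent, slightly tidier pattern (a sketch, using the same file name) is a small generator that yields records until pickle.load raises EOFError.
In [ ]:
def load_pickles(path):
    """Yield successive pickled records from `path` until end of file."""
    with open(path, 'rb') as f:
        while True:
            try:
                yield pickle.load(f)
            except EOFError:
                return

#results = list(load_pickles("search_output-2016-10-16.bin"))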
We need to create the most important dimension - the word count.
In [ ]:
headers = results[0]
word_results = [result[1] for result in results[1:]]
Removing common words, links and mentions (words beginning with @).
Selecting alphanumeric words.
In [ ]:
exclusion_word_list = "|".join(map(lambda x: "(%ss?)" % x,
    {"today","what","where","when","why","about","person","people","man","men","women","link","image","video","pic","picture","gif", #nouns
     "she","i","he","it","their","that","you","we","us","your","our","this","his","her","my","mine","who","whom", #pronouns
     "as","from","here","on","in","by","after","up","with","will","at","out","over","before","into", #prepositions
     "could","should","would","are","have","do","has","does","been","be","can","may", #auxiliary verbs
     "is","was","to",
     "and","than","also","but","how", #conjunctions
     "an","the","a", #articles
     "bye","if","for","of",
     "now", #adverbs
     "says","say","take","get","read","watch","like", #verbs
     "true","new","not","no","more","all","good","better","best","bad","worse","worst","just","most","less","least","first","last", #adjectives
    }))
exclusion_patterns = [r'@\S+', r'http', r'[^a-zA-Z\d\s:]'] #mentions, links and non-alphanumeric characters
exclusion_patterns += [r'^(%s)\b' % exclusion_word_list] # beginning of strings
exclusion_patterns += [r'\b(%s)$' % exclusion_word_list] # end of strings
exclusion_patterns += [r'\b(%s)\b' % exclusion_word_list] # mid-strings
selection_patterns = [r'\b(\w+)\b'] #only selecting words
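As a quick sanity check, the patterns can be applied to a few illustrative tokens (the tokens below are made up for the example, not taken from the data).
In [ ]:
sample_tokens = ["@someone", "https://t.co/abc", "The", "debate", "it's"] # illustrative only
for token in sample_tokens:
    token = token.lower()
    excluded = any(regex.search(rx, token) for rx in exclusion_patterns)
    print(token, "excluded" if excluded else "kept")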
In [ ]:
word_dict = {}
result_bags = [] #sets of words in the same tweet - used below when looking for co-occurring words
for i,r in enumerate(word_results):
    #if(100<i<110): print(r)
    temp_words = r.split()
    result_bag = set()
    for w in temp_words:
        temp_word = w.lower()
        exclusion_matches = [True
                             for rx in exclusion_patterns
                             if regex.search(rx,temp_word)]
        if not any(exclusion_matches) and temp_word!="":
            for rx in selection_patterns:
                match = regex.search(rx,temp_word)
                if(match): temp_word = match.groups()[0]
            if temp_word not in word_dict: word_dict[temp_word] = 0
            word_dict[temp_word] += 1
            result_bag |= {temp_word} # |= (not ^=) so a word repeated in a tweet is not toggled back out of the bag
            #if(100<i<110): print(temp_word)
    if(result_bag): result_bags += [result_bag]
    #if(100<i<110):print("\n")
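A quick way to eyeball the counts is to list the most frequent surviving words (a sketch; collections.Counter simply wraps the dictionary built above).
In [ ]:
import collections
collections.Counter(word_dict).most_common(20) # 20 most frequent words after filtering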
In [ ]:
# definitive ordering of word list
word_order_dict = {}
word_list = numpy.array(sorted(word_dict.keys()))
for i,word in enumerate(word_list): word_order_dict[word] = i
# creating word matrix
word_matrix = numpy.zeros((len(word_dict),len(word_dict)),dtype=numpy.uint)
# counting the number of times each word occurs with each other word
for n,rb in enumerate(result_bags):
    #if(100<n<110): print(rb)
    for word in rb:
        i = word_order_dict[word]
        for word2 in rb:
            j = word_order_dict[word2]
            if(i != j): word_matrix[i][j] += 1
#making the co-occurrence matrix sparse
#word_matrix = scipy.sparse.dok_matrix(word_matrix)
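The commented-out line hints at a sparse representation. As a sketch, the counts can also be accumulated directly in a scipy.sparse.dok_matrix, which avoids allocating the full dense len(word_dict) x len(word_dict) array for large vocabularies.
In [ ]:
import itertools
sparse_word_matrix = scipy.sparse.dok_matrix((len(word_dict), len(word_dict)), dtype=numpy.uint)
for rb in result_bags:
    for w1, w2 in itertools.permutations(rb, 2): # every ordered pair of distinct words in the tweet
        sparse_word_matrix[word_order_dict[w1], word_order_dict[w2]] += 1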
Grouping those which almost always occur together
In [ ]:
#Selecting those words whose co-occurrence counts exceed a threshold
THRESHOLD = 10
maxes_over_threshold = numpy.array(numpy.max(word_matrix,axis=1)>THRESHOLD).reshape(-1) #word_matrix.todense()
COOCCURRENCE_FREQ_THRESHOLD = 0.75
for word in word_list[maxes_over_threshold]:
    row = word_matrix[word_order_dict[word]] #.todense()
    if(word in word_dict):
        maxes = numpy.array(row).reshape(-1) == row.max()
        partner_words = word_list[maxes]
        co_occurrence_freq = row.max()/word_dict[word]
        if(co_occurrence_freq > COOCCURRENCE_FREQ_THRESHOLD):
            #for w in [word]+partner_words.tolist(): print(w,word_dict[w])
            print("Removing {} in lieu of {}".format(word,partner_words))
            del word_dict[word]
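As a hypothetical example of the rule above: if a word occurs 40 times in total and its most frequent partner appears alongside it in 35 of those tweets, the co-occurrence frequency is 35/40 = 0.875, which exceeds the 0.75 threshold, so the word is removed in favour of its partner.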
In [ ]:
word_array = numpy.array([(word_order_dict[w],word_dict[w]) for w in word_dict])
word_array = word_array[numpy.argsort(word_array[:,1])]
In [ ]:
WORD_FIELDS = 100
word_fields = set()
for w in word_array[-WORD_FIELDS:]:
    word = word_list[w[0]]
    word_fields |= {word}
In [ ]:
print(word_fields)
In [ ]:
print(headers)
In [ ]:
clients = set()
languages = set()
for result in results[1:]:
    clients |= {result[-1]}
    languages |= {result[-2]}
Languages = enum.Enum("Language",list(languages))
Clients = enum.Enum("Client",list(clients))
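The dynamically created enums map each raw string from the data onto a named member, for example (using whichever language code happens to be in the set).
In [ ]:
some_language = next(iter(languages)) # any observed language code
print(Languages[some_language], Languages[some_language].value)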
In [ ]:
results_list = []
for result in results[1:]:
    result_row = [
        result[2], #Lat
        result[3], #Long
        result[4], #Time
        result[5], #Favourites
        result[6], #Retweets
        Languages[result[7]],
        Clients[result[8]]
    ]
    word_field_result = [word in result[1] for word in word_fields]
    if any(word_field_result):
        result_row += word_field_result
        results_list += [result_row]
result_array = numpy.array(results_list)
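Because each row mixes floats, timestamps, enum members and booleans, numpy is expected to fall back to an object dtype for the combined array.
In [ ]:
print(result_array.shape, result_array.dtype) # expected to be an object-dtype array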
Using Google News to give a relevance score for each Tweet. Google does not seem to like this.
In [ ]:
text_pattern = r'id="resultStats">(.*)results'
text_regex = regex.compile(text_pattern)
number_pattern = r"(\d{1,4}(,\d{3})*)(\.\d+)?"
number_regex = regex.compile(number_pattern)
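As a quick check on the number pattern (the string below is a made-up example of the result-count text, not an actual response).
In [ ]:
sample = "About 12,400 results" # made-up example of Google's resultStats text
float(number_regex.search(sample).groups()[0].replace(",", "")) # -> 12400.0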
In [ ]:
for word in word_fields: # the most frequent words selected above
    time.sleep(1)
    search_url = ['https://www.google.com/search?q=', word,
                  '&safe=off&hl=en&gl=za&source=lnt',
                  '&tbs=cdr%3A1',
                  '%2Ccd_min%3A2016%2F10%2F07',
                  '%2Ccd_max%3A2016%2F10%2F07',
                  '&tbm=nws']
    result = requests.get("".join(search_url))
    text_match = text_regex.search(result.text)
    if text_match is None: continue # Google often refuses automated queries
    result_text = text_match.groups()[0]
    number_results = float(number_regex.search(result_text).groups()[0].replace(",",""))
    print(word, number_results)
In [ ]:
import sklearn.feature_extraction.text
In [ ]:
count_vect = sklearn.feature_extraction.text.CountVectorizer()
In [ ]:
bag_of_words = count_vect.fit_transform(word_results) # fit on the tweet texts, then transform them
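The fitted vectoriser exposes its vocabulary, so the shape of the document-term matrix can be checked against it.
In [ ]:
print(bag_of_words.shape, len(count_vect.vocabulary_)) # one row per tweet, one column per vocabulary term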