In [ ]:
import pickle,string,time,enum
In [ ]:
import numpy,requests,scipy.sparse,regex
import matplotlib.pyplot as plt
Reading in the Tweepy data and turning it into a dictionary
In [ ]:
with open("search_output-2016-10-16.bin", 'rb') as tweet_file:
    results = []
    while not tweet_file.closed:
        try:
            results += [pickle.load(tweet_file)]
        except EOFError:
            tweet_file.close()
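The loop above ends only because the file is closed inside the except clause. An equivalent, slightly tidier pattern (a sketch, using the same file name) is a small generator that yields records until pickle.load raises EOFError.
In [ ]:
def load_pickles(path):
    """Yield successive pickled records from `path` until end of file."""
    with open(path, 'rb') as f:
        while True:
            try:
                yield pickle.load(f)
            except EOFError:
                return

#results = list(load_pickles("search_output-2016-10-16.bin"))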
We need to create the most important dimension - the word count.
In [ ]:
headers = results[0]
word_results = [result[1] for result in results[1:]]
Removing common words, links and mentions (words beginning with @).
Selecting alphanumeric words.
In [ ]:
exclusion_word_list = "|".join(map(lambda x: "(%ss?)" % x,
    {"today","what","where","when","why","about","person","people","man","men","women","link","image","video","pic","picture","gif", #nouns
     "she","i","he","it","their","that","you","we","us","your","our","this","his","her","my","mine","who","whom", #pronouns
     "as","from","here","on","in","by","after","up","with","will","at","out","over","before","into", #prepositions
     "could","should","would","are","have","do","has","does","been","be","can","may", #auxiliary verbs
     "is","was","to",
     "and","than","also","but","how", #conjunctions
     "an","the","a", #articles
     "bye","if","for","of",
     "now", #adverbs
     "says","say","take","get","read","watch","like", #verbs
     "true","new","not","no","more","all","good","better","best","bad","worse","worst","just","most","less","least","first","last", #adjectives
    }))
exclusion_patterns = [r'@\S+', r'http', r'[^a-zA-Z\d\s:]'] #mentions, links and non-alphanumeric characters
exclusion_patterns += [r'^(%s)\b' % exclusion_word_list] # beginning of strings
exclusion_patterns += [r'\b(%s)$' % exclusion_word_list] # end of strings
exclusion_patterns += [r'\b(%s)\b' % exclusion_word_list] # mid-strings
selection_patterns = [r'\b(\w+)\b'] #only selecting words
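As a quick sanity check, the patterns can be applied to a few illustrative tokens (the tokens below are made up for the example, not taken from the data).
In [ ]:
sample_tokens = ["@someone", "https://t.co/abc", "The", "debate", "it's"] # illustrative only
for token in sample_tokens:
    token = token.lower()
    excluded = any(regex.search(rx, token) for rx in exclusion_patterns)
    print(token, "excluded" if excluded else "kept")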
In [ ]:
word_dict = {}
result_bags = [] #sets of words in the same tweet - used below when looking for co-occurring words
for i,r in enumerate(word_results):
    #if(100<i<110): print(r)
    temp_words = r.split()
    result_bag = set()
    for w in temp_words:
        temp_word = w.lower()
        exclusion_matches = [True
                             for rx in exclusion_patterns
                             if regex.search(rx,temp_word)]
        if not any(exclusion_matches) and temp_word!="":
            for rx in selection_patterns:
                match = regex.search(rx,temp_word)
                if(match): temp_word = match.groups()[0]
            if temp_word not in word_dict: word_dict[temp_word] = 0
            word_dict[temp_word] += 1
            result_bag |= {temp_word} # |= (not ^=) so a word repeated in a tweet is not toggled back out of the bag
            #if(100<i<110): print(temp_word)
    if(result_bag): result_bags += [result_bag]
    #if(100<i<110):print("\n")
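A quick way to eyeball the counts is to list the most frequent surviving words (a sketch; collections.Counter simply wraps the dictionary built above).
In [ ]:
import collections
collections.Counter(word_dict).most_common(20) # 20 most frequent words after filtering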
In [ ]:
# definitive ordering of word list
word_order_dict = {}
word_list = numpy.array(sorted(word_dict.keys()))
for i,word in enumerate(word_list): word_order_dict[word] = i
# creating word matrix
word_matrix = numpy.zeros((len(word_dict),len(word_dict)),dtype=numpy.uint)
# counting the number of times each word occurs with each other word
for n,rb in enumerate(result_bags):
    #if(100<n<110): print(rb)
    for word in rb:
        i = word_order_dict[word]
        for word2 in rb:
            j = word_order_dict[word2]
            if(i != j): word_matrix[i][j] += 1
#making the co-occurrence matrix sparse
#word_matrix = scipy.sparse.dok_matrix(word_matrix)
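The commented-out line hints at a sparse representation. As a sketch, the counts can also be accumulated directly in a scipy.sparse.dok_matrix, which avoids allocating the full dense len(word_dict) x len(word_dict) array for large vocabularies.
In [ ]:
import itertools
sparse_word_matrix = scipy.sparse.dok_matrix((len(word_dict), len(word_dict)), dtype=numpy.uint)
for rb in result_bags:
    for w1, w2 in itertools.permutations(rb, 2): # every ordered pair of distinct words in the tweet
        sparse_word_matrix[word_order_dict[w1], word_order_dict[w2]] += 1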
Grouping those which almost always occur together
In [ ]:
#Selecting those words whose co-occurrence counts exceed a threshold
THRESHOLD = 10
maxes_over_threshold = numpy.array(numpy.max(word_matrix,axis=1)>THRESHOLD).reshape(-1) #word_matrix.todense()
COOCCURRENCE_FREQ_THRESHOLD = 0.75
for word in word_list[maxes_over_threshold]:
    row = word_matrix[word_order_dict[word]] #.todense()
    if(word in word_dict):
        maxes = numpy.array(row).reshape(-1) == row.max()
        partner_words = word_list[maxes]
        co_occurrence_freq = row.max()/word_dict[word]
        if(co_occurrence_freq > COOCCURRENCE_FREQ_THRESHOLD):
            #for w in [word]+partner_words.tolist(): print(w,word_dict[w])
            print("Removing {} in lieu of {}".format(word,partner_words))
            del word_dict[word]
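As a hypothetical example of the rule above: if a word occurs 40 times in total and its most frequent partner appears alongside it in 35 of those tweets, the co-occurrence frequency is 35/40 = 0.875, which exceeds the 0.75 threshold, so the word is removed in favour of its partner.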
In [ ]:
word_array = numpy.array([(word_order_dict[w],word_dict[w]) for w in word_dict])
word_array = word_array[numpy.argsort(word_array[:,1])]
In [ ]:
WORD_FIELDS = 100
word_fields = set()
for w in word_array[-WORD_FIELDS:]:
    word = word_list[w[0]]
    word_fields |= {word}
In [ ]:
print(word_fields)
In [ ]:
print(headers)
In [ ]:
clients = set()
languages = set()
for result in results[1:]:
    clients |= {result[-1]}
    languages |= {result[-2]}
Languages = enum.Enum("Language",list(languages))
Clients = enum.Enum("Client",list(clients))
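The dynamically created enums map each raw string from the data onto a named member, for example (using whichever language code happens to be in the set).
In [ ]:
some_language = next(iter(languages)) # any observed language code
print(Languages[some_language], Languages[some_language].value)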
In [ ]:
results_list = []
for result in results[1:]:
    result_row = [
        result[2], #Lat
        result[3], #Long
        result[4], #Time
        result[5], #Favourites
        result[6], #Retweets
        Languages[result[7]],
        Clients[result[8]]
    ]
    word_field_result = [word in result[1] for word in word_fields]
    if any(word_field_result):
        result_row += word_field_result
        results_list += [result_row]
result_array = numpy.array(results_list)
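Because each row mixes floats, timestamps, enum members and booleans, numpy is expected to fall back to an object dtype for the combined array.
In [ ]:
print(result_array.shape, result_array.dtype) # expected to be an object-dtype array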
Using Google News to give a relevance score for each Tweet. Google does not seem to like this.
In [ ]:
text_pattern = r'id="resultStats">(.*)results'
text_regex = regex.compile(text_pattern)
number_pattern = r"(\d{1,4}(,\d{3})*)(\.\d+)?"
number_regex = regex.compile(number_pattern)
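As a quick check on the number pattern (the string below is a made-up example of the result-count text, not an actual response).
In [ ]:
sample = "About 12,400 results" # made-up example of Google's resultStats text
float(number_regex.search(sample).groups()[0].replace(",", "")) # -> 12400.0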
In [ ]:
for word in word_fields: # the most frequent words selected above
    time.sleep(1)
    search_url = ['https://www.google.com/search?q=', word,
                  '&safe=off&hl=en&gl=za&source=lnt',
                  '&tbs=cdr%3A1',
                  '%2Ccd_min%3A2016%2F10%2F07',
                  '%2Ccd_max%3A2016%2F10%2F07',
                  '&tbm=nws']
    result = requests.get("".join(search_url))
    text_match = text_regex.search(result.text)
    if text_match is None: continue # Google often refuses automated queries
    result_text = text_match.groups()[0]
    number_results = float(number_regex.search(result_text).groups()[0].replace(",",""))
    print(word, number_results)
In [ ]:
import sklearn.feature_extraction.text
In [ ]:
count_vect = sklearn.feature_extraction.text.CountVectorizer()
In [ ]:
bag_of_words = count_vect.fit_transform(word_results) # fit on the tweet texts, then transform them
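The fitted vectoriser exposes its vocabulary, so the shape of the document-term matrix can be checked against it.
In [ ]:
print(bag_of_words.shape, len(count_vect.vocabulary_)) # one row per tweet, one column per vocabulary term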