In [1]:
import timeit
import importlib
import relevancer as rlv
import pandas as pd
from sklearn.naive_bayes import MultinomialNB
from bson.objectid import ObjectId
from collections import Counter
import numpy as np
pd.set_option("display.max_colwidth",200)
In [2]:
# Reload the Relevancer module to pick up the latest code changes.
importlib.reload(rlv)
active_col = "active_text"
rlv.set_active_column(active_col)
In [3]:
#importlib.reload(rlv)
my_token_pattern=r"[#@]?\w+\b|[\U00010000-\U0010ffff]"
rlvdb, rlvcl = rlv.connect_mongodb(configfile='elifdb.ini',coll_name="testcl")
# set the active column
active_col = "active_text"
rlv.set_active_column(active_col)
In [4]:
importlib.reload(rlv)
# After the first iteration, tweets belonging to the annotated clusters should be excluded from the clustering.
# Read these ids from the collection of annotated (tagged) tweets.
annotated_tw_ids = ['563657483395530753', '563662532326330370', '563654330041909248', '563654944927281152', '563657924233289728', '563661021559390208', '563651950386757632', '563657164317667328', '563660271810383872', '563662538949160960'] #You should get the actual annotated tweet ids from the annotated tweets collection.
#annotated_tw_ids = []
# mongo_query=({'_id': {'$gte': begin, '$lte': end},'lang':'en'})
tweetlist = rlv.read_json_tweet_fields_database(rlvcl, mongo_query={}, read_fields={'text': 1, 'id_str': 1, '_id': 0, 'user_id': 1}, tweet_count=-1, annotated_ids=annotated_tw_ids)
rlv.logging.info("Number of tweets:" + str(len(tweetlist)))
print("Number of tweets:",len(tweetlist))
In [ ]:
tweetsDF = rlv.create_dataframe(tweetlist)
tweetsDF.head()
In [ ]:
len(tweetsDF)
In [ ]:
#tweetsDF.to_pickle("20151005_tweetsDF_Genocide")
In [ ]:
# Be careful not to overwrite tweetsDFBackUP with a modified tweetsDF.
#tweetsDFBackUP = tweetsDF.copy()
#len(tweetsDFBackUP)
In [ ]:
# Restore a clean copy (requires the tweetsDFBackUP cell above to have been run)
#importlib.reload(rlv)
tweetsDF = tweetsDFBackUP.copy()
len(tweetsDF)
In [ ]:
tweetsDF[active_col] = tweetsDF["text"].copy()
tweetsDF = rlv.tok_results(tweetsDF, elimrt=True)
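In [ ]:
# elimrt=True presumably drops retweets during tokenization. Purely as an illustration of that
# idea (not Relevancer's actual logic), an equivalent plain-pandas filter on the conventional
# "RT @" prefix could look like this:
#tweetsDF = tweetsDF[~tweetsDF[active_col].str.startswith("RT @")].reset_index(drop=True)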
In [ ]:
len(tweetsDF)
In [ ]:
tweetsDF = rlv.normalize_text(tweetsDF)
len(tweetsDF), tweetsDF.columns
In [ ]:
#importlib.reload(rlv)
#rlv.set_active_column(active_col)
In [ ]:
def eliminate_duplicates_recursively(df, duplicate_elim_func):
    """
    The near-duplicate detection algorithm is not memory-efficient enough to run on the whole
    data set at once. Therefore we split the data into buckets, eliminate duplicates within each
    bucket, merge and shuffle the results, and repeat the cycle until no bucket contains a
    duplicate. This may take long for big data sets; the conditions can be relaxed to finish
    quicker at the cost of leaving a few duplicates.
    """
    print("starting, length:", len(df))
    # Shuffle so that near-duplicates end up in different buckets across iterations.
    df = df.reindex(np.random.permutation(df.index))
    df.reset_index(inplace=True, drop=True)
    step = 10000  # bucket size
    tmp_df2 = pd.DataFrame()
    for i in range(0, len(df), step):
        tmp_unique = duplicate_elim_func(df[i:i+step], similarity_threshold=0.20, debug=True, debug_threshold=10000)
        tmp_df2 = pd.concat([tmp_df2, tmp_unique], ignore_index=True)
    if len(df) > len(tmp_df2):
        print(str(len(df) - len(tmp_df2)) + " tweets were eliminated!")
        return eliminate_duplicates_recursively(tmp_df2, duplicate_elim_func)
    return df
In [ ]:
tweetsDF_uniq = eliminate_duplicates_recursively(tweetsDF.copy(), rlv.get_and_eliminate_near_duplicate_tweets)
In [ ]:
len(tweetsDF_uniq)
In [ ]:
tweetsDF_uniq.to_pickle("20151005_unique_genocide_tweets.pickle")
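In [ ]:
# The pickle above allows resuming from this point in a later session without repeating the
# duplicate elimination:
#tweetsDF_uniq = pd.read_pickle("20151005_unique_genocide_tweets.pickle")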
In [ ]:
tweetsDF_uniq[["active_text"]][:10]
In [224]:
importlib.reload(rlv)
rlv.set_active_column(active_col)
In [225]:
cluster_list = rlv.create_clusters(tweetsDF_uniq, my_token_pattern, min_dist_thres=0.725, max_dist_thres=0.875, min_max_diff_thres=0.4, nameprefix='1-', min_clusters=100, user_identifier='user_id')
In [226]:
print("Number of clusters:",len(cluster_list))
print("available cluster information:", cluster_list[0].keys())
In [227]:
i = 0
In [237]:
#i = i-2
print("No:",cluster_list[i]['cno'])
print("CStr:",cluster_list[i]['cstr'])
print("Cluster Tuple List:")
print(*[(c[0],c[2]) for c in cluster_list[i]['ctweettuplelist']], sep='\n')
# Add any field you want to observe.
i+=1
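In [ ]:
# Instead of stepping through the clusters one by one, the cluster summaries can also be
# written to a plain-text file for offline annotation. Only the 'cno' and 'cstr' fields shown
# above are used; the file name is arbitrary.
with open("genocide_cluster_summaries.txt", "w") as f:
    for c in cluster_list:
        f.write("Cluster " + str(c['cno']) + "\n" + str(c['cstr']) + "\n\n")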
In [215]:
print("Reverse Frequency of the terms:")
print(i)
for k, v in Counter({int(k): v for k, v in cluster_list[i-1]['rif'].items()}).items():
    if k > 1:
        print(k, ":", v)
In [252]:
rlvdb2, rlvcl2 = rlv.connect_mongodb(configfile='ebasar2.ini',coll_name="testcl")
collection_name = 'genocide_clusters_20151005'
rlvdb2[collection_name].insert(cluster_list) # Each iteration produces its own candidate cluster list, so lists from different iterations are not mixed.
print("Clusters were written to the collection:", collection_name)
In [ ]:
import json
# default=str handles values json cannot serialize natively (e.g. the ObjectId added by the MongoDB insert above).
with open('genocide_clusters.json', 'w') as outfile:
    json.dump(cluster_list, outfile, default=str)
In [ ]:
# import pickle
# with open("genocide_clusters_20151005", 'wb') as f:
#     pickle.dump(cluster_list, f, pickle.HIGHEST_PROTOCOL)
# print("wrote cluster_list to pickle")
In [3]:
import pickle
with open("genocide_clusters_20151005", 'rb') as f:
    cluster_list = pickle.load(f)
In [4]:
cluster_list[0].keys()
In [249]:
cluster_list_small = []
for c in cluster_list:
    cluster_list_small.append({k: v for k, v in c.items() if k in ["cno", "cstr", "user_entropy", "rif", "_id", "cnoprefix", "twids"]})
In [250]:
collection_name = 'genocide_clusters_small_20151005'
rlvdb[collection_name].insert(cluster_list_small) # Each iteration produces its own candidate cluster list, so lists from different iterations are not mixed.
print("Clusters were written to the collection:", collection_name)
In [ ]: