In [381]:
import importlib
import relevancer as rlv # https://github.com/cengelif/Relevancer # branch 'ali' contains the latest version.
import pandas as pd
from collections import Counter
import numpy as np
pd.set_option("display.max_colwidth",200)
In [382]:
importlib.reload(rlv) # reload Relevancer to pick up the latest version after the module code is edited.
active_col = "active_text"
rlv.set_active_column(active_col)
my_token_pattern = r"[-+]?\d+[.,]?\d+|[#@]?\w+\b|[\U00010000-\U0010ffff]|[.:()\[\],;?!*]{2,4}"
rlv.set_token_pattern(my_token_pattern)
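As a quick sanity check (not part of the original pipeline), the pattern can be applied to a made-up tweet to see which tokens it extracts; the sample text below is purely illustrative.
In [ ]:
import re
# Illustrative only: show which tokens my_token_pattern extracts from a made-up tweet.
sample = "@user1 check http://t.co/abc #breaking 3.5 magnitude quake!! 😱"
print(re.findall(my_token_pattern, sample))
# roughly: ['@user1', 'check', 'http', 't', 'co', 'abc', '#breaking', '3.5', 'magnitude', 'quake', '!!', '😱']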
In [383]:
negatives_df = pd.read_excel("../../PhD/traineeship/tasks/survey2/Shogun_survey_tr_start.xlsx", sheet_name='Negatives')
negatives_df.head(2)
Out[383]:
In [384]:
len(negatives_df), negatives_df.soundbite_text[:4]
Out[384]:
In [385]:
rlv.tok_result_col = "soundbite_text" # for compatibility.
In [386]:
# This setting does not modify the tweet data itself; it only prepares the active text column that will be processed.
negatives_df[active_col] = negatives_df["soundbite_text"].copy()
negatives_df = rlv.tok_results(negatives_df, elimrt=False)
In [387]:
negatives_df = rlv.normalize_text(negatives_df) ## urls become urlurlurl, user names become usrusrusr
len(negatives_df), negatives_df.columns
Out[387]:
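For readers without the Relevancer code at hand, here is a minimal sketch of what this normalization is assumed to do; the actual rlv.normalize_text implementation may differ.
In [ ]:
def normalize_text_sketch(df, text_col=active_col):
    # Assumed behaviour of the normalization step, not the real implementation:
    # replace URLs and @-mentions with fixed placeholders so near-duplicates align.
    df = df.copy()
    df[text_col] = df[text_col].str.replace(r"https?://\S+|www\.\S+", "urlurlurl", regex=True)
    df[text_col] = df[text_col].str.replace(r"@\w+", "usrusrusr", regex=True)
    return df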
In [388]:
def eliminate_duplicates_bucketwise(df, duplicate_elim_func, step=10000):
    """
    The actual near-duplicate detection algorithm is not memory-efficient enough. Therefore,
    we divide the data into buckets, eliminate duplicates within each bucket, merge the results,
    shuffle, and repeat the cycle until no duplicates are detected in any bucket. This may take
    long for big data sets. The conditions can be relaxed to finish more quickly, at the cost of
    leaving a few duplicates.
    """
    print("starting, length:", len(df))
    # Shuffle so that potential duplicates end up in different buckets across iterations.
    df = df.reindex(np.random.permutation(df.index))
    df.reset_index(inplace=True, drop=True)
    tmp_df2 = pd.DataFrame()
    for i in range(0, len(df), step):
        tmp_unique = duplicate_elim_func(df[i:i+step], similarity_threshold=0.10, debug=True, debug_threshold=10000)
        tmp_df2 = pd.concat([tmp_df2, tmp_unique], ignore_index=True)
    if len(df) > len(tmp_df2):
        print(str(len(df) - len(tmp_df2)) + " tweets were eliminated!")
        return eliminate_duplicates_bucketwise(tmp_df2, duplicate_elim_func, step=step)
    return df
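The duplicate_elim_func passed in below is rlv.get_and_eliminate_near_duplicate_tweets from Relevancer. For illustration only, a stand-in with the same call signature could look like the stub below; it drops only exact duplicates, whereas the Relevancer function detects near-duplicates by text similarity.
In [ ]:
def exact_duplicate_elim_stub(df, similarity_threshold=0.10, debug=False, debug_threshold=10000):
    # Hypothetical stand-in: similarity_threshold is ignored here, since this stub
    # removes exact duplicates of the active text column only.
    unique_df = df.drop_duplicates(subset=[active_col])
    if debug and len(df) > debug_threshold:
        print("bucket size:", len(df), "-> unique:", len(unique_df))
    return unique_df
# tweetsDF_uniq = eliminate_duplicates_bucketwise(negatives_df.copy(), exact_duplicate_elim_stub)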
In [389]:
tweetsDF_uniq = eliminate_duplicates_bucketwise(negatives_df.copy(), rlv.get_and_eliminate_near_duplicate_tweets)
In [390]:
print(len(tweetsDF_uniq))
tweetsDF_uniq[["active_text"]][:10]
Out[390]:
In [391]:
# A complete tweet set would already contain these fields; the dummy assignments are only needed because this data set lacks them.
tweetsDF_uniq["id_str"] = '-1'
tweetsDF_uniq["user_id"] = '-1'
In [392]:
# min_clusters should not be too high: since the (near-)duplicates were eliminated, clusters are harder to spot.
cluster_list = rlv.create_clusters(tweetsDF_uniq, my_token_pattern, min_dist_thres=0.725, max_dist_thres=0.8, min_max_diff_thres=0.4, nameprefix='1-', min_clusters=1, user_identifier='user_id')
In [393]:
print("Number of clusters:",len(cluster_list))
print("available cluster information:", cluster_list[0].keys())
In [394]:
i = 0
In [395]:
print("No:",cluster_list[i]['cno'])
print("CStr:",cluster_list[i]['cstr'])
print("Cluster Tuple List:")
print(*[(c[0],c[2]) for c in cluster_list[i]['ctweettuplelist']], sep='\n')
i+=1
In [396]:
print("Reverse Frequency of the terms:")
print(i)
for k, v in Counter({int(k):v for k,v in cluster_list[i-1]['rif'].items()}).items():
    if k>1:
        print(k,":",v)
In [ ]: