In [381]:
import importlib
import relevancer as rlv # https://github.com/cengelif/Relevancer # branch 'ali' contains the latest version.
import pandas as pd
from collections import Counter
import numpy as np
pd.set_option("display.max_colwidth",200)
In [382]:
importlib.reload(rlv) # reload Relevancer to pick up the latest version after the module code is edited.
active_col = "active_text"
rlv.set_active_column(active_col)
my_token_pattern = r"[-+]?\d+[.,]?\d+|[#@]?\w+\b|[\U00010000-\U0010ffff]|[.:()\[\],;?!*]{2,4}"
rlv.set_token_pattern(my_token_pattern)
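As a quick sanity check (not part of the original pipeline), the pattern can be applied to a made-up tweet to see which tokens it extracts; the sample text below is purely illustrative.
In [ ]:
import re
# Illustrative only: show which tokens my_token_pattern extracts from a made-up tweet.
sample = "@user1 check http://t.co/abc #breaking 3.5 magnitude quake!! 😱"
print(re.findall(my_token_pattern, sample))
# roughly: ['@user1', 'check', 'http', 't', 'co', 'abc', '#breaking', '3.5', 'magnitude', 'quake', '!!', '😱']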
In [383]:
negatives_df = pd.read_excel("../../PhD/traineeship/tasks/survey2/Shogun_survey_tr_start.xlsx", sheet_name='Negatives')
negatives_df.head(2)
Out[383]:
In [384]:
len(negatives_df), negatives_df.soundbite_text[:4]
Out[384]:
In [385]:
rlv.tok_result_col = "soundbite_text" # for compatibility.
In [386]:
# This setting does not modify the tweet data itself; it only prepares the active text column that will be processed.
negatives_df[active_col] = negatives_df["soundbite_text"].copy()
negatives_df = rlv.tok_results(negatives_df, elimrt=False)
In [387]:
negatives_df = rlv.normalize_text(negatives_df) ## urls become urlurlurl, user names become usrusrusr
len(negatives_df), negatives_df.columns
Out[387]:
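For readers without the Relevancer code at hand, here is a minimal sketch of what this normalization is assumed to do; the actual rlv.normalize_text implementation may differ.
In [ ]:
def normalize_text_sketch(df, text_col=active_col):
    # Assumed behaviour of the normalization step, not the real implementation:
    # replace URLs and @-mentions with fixed placeholders so near-duplicates align.
    df = df.copy()
    df[text_col] = df[text_col].str.replace(r"https?://\S+|www\.\S+", "urlurlurl", regex=True)
    df[text_col] = df[text_col].str.replace(r"@\w+", "usrusrusr", regex=True)
    return df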
In [388]:
def eliminate_duplicates_bucketwise(df, duplicate_elim_func, step=10000):
    """
    The actual near-duplicate detection algorithm is not memory-efficient enough. Therefore,
    we divide the data into buckets, eliminate duplicates within each bucket, merge the results,
    shuffle, and repeat the cycle until no duplicates are detected in any bucket. This may take
    long for big data sets. The conditions can be relaxed to finish more quickly, at the cost of
    leaving a few duplicates.
    """
    print("starting, length:", len(df))
    # Shuffle so that potential duplicates end up in different buckets across iterations.
    df = df.reindex(np.random.permutation(df.index))
    df.reset_index(inplace=True, drop=True)
    tmp_df2 = pd.DataFrame()
    for i in range(0, len(df), step):
        tmp_unique = duplicate_elim_func(df[i:i+step], similarity_threshold=0.10, debug=True, debug_threshold=10000)
        tmp_df2 = pd.concat([tmp_df2, tmp_unique], ignore_index=True)
    if len(df) > len(tmp_df2):
        print(str(len(df) - len(tmp_df2)) + " tweets were eliminated!")
        return eliminate_duplicates_bucketwise(tmp_df2, duplicate_elim_func, step=step)
    return df
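The duplicate_elim_func passed in below is rlv.get_and_eliminate_near_duplicate_tweets from Relevancer. For illustration only, a stand-in with the same call signature could look like the stub below; it drops only exact duplicates, whereas the Relevancer function detects near-duplicates by text similarity.
In [ ]:
def exact_duplicate_elim_stub(df, similarity_threshold=0.10, debug=False, debug_threshold=10000):
    # Hypothetical stand-in: similarity_threshold is ignored here, since this stub
    # removes exact duplicates of the active text column only.
    unique_df = df.drop_duplicates(subset=[active_col])
    if debug and len(df) > debug_threshold:
        print("bucket size:", len(df), "-> unique:", len(unique_df))
    return unique_df
# tweetsDF_uniq = eliminate_duplicates_bucketwise(negatives_df.copy(), exact_duplicate_elim_stub)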
In [389]:
tweetsDF_uniq = eliminate_duplicates_bucketwise(negatives_df.copy(), rlv.get_and_eliminate_near_duplicate_tweets)
In [390]:
print(len(tweetsDF_uniq))
tweetsDF_uniq[["active_text"]][:10]
Out[390]:
In [391]:
# A complete tweet set would already contain these fields; the dummy assignments are only needed because this data set lacks them.
tweetsDF_uniq["id_str"] = '-1'
tweetsDF_uniq["user_id"] = '-1'
In [392]:
# min_clusters should not be too high: since the (near-)duplicates were eliminated, clusters are harder to spot.
cluster_list = rlv.create_clusters(tweetsDF_uniq, my_token_pattern, min_dist_thres=0.725, max_dist_thres=0.8, min_max_diff_thres=0.4, nameprefix='1-', min_clusters=1, user_identifier='user_id')
In [393]:
print("Number of clusters:",len(cluster_list))
print("available cluster information:", cluster_list[0].keys())
In [394]:
i = 0
In [395]:
print("No:",cluster_list[i]['cno'])
print("CStr:",cluster_list[i]['cstr'])
print("Cluster Tuple List:")
print(*[(c[0],c[2]) for c in cluster_list[i]['ctweettuplelist']], sep='\n')
i+=1
In [396]:
print("Reverse Frequency of the terms:")
print(i)
for k, v in Counter({int(k):v for k,v in cluster_list[i-1]['rif'].items()}).items():
    if k>1:
        print(k,":",v)
In [ ]: