In [1]:
from utility_code.rule_based_features import *
from utility_code.util import *
from utility_code.create_features import *

tw_distant_supervision_identity_dat = get_twitter_distant_supervision_identity_dat("../r/output_fil.tsv")


utility_code/create_features.py:263: FutureWarning: sort(columns=....) is deprecated, use sort_values(by=.....)
  .sort(inplace=False,columns='i',ascending=False)\
utility_code/create_features.py:266: FutureWarning: sort(columns=....) is deprecated, use sort_values(by=.....)
  identity_dat.sort("tot",inplace=True,ascending=False)
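These warnings point at the deprecated DataFrame.sort API used inside create_features.py; the replacement is sort_values. A minimal sketch of the migration, using a hypothetical frame df rather than the actual code in create_features.py:

import pandas as pd

df = pd.DataFrame({'i': [3, 1, 2], 'tot': [10, 30, 20]})
# old: df.sort(columns='i', ascending=False, inplace=False)
df = df.sort_values(by='i', ascending=False)
# old: df.sort('tot', inplace=True, ascending=False)
df.sort_values(by='tot', ascending=False, inplace=True)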

In [2]:
tw = tw_distant_supervision_identity_dat

In [4]:
stopwords = get_stopwords()

In [9]:
# keep only terms that are not stopwords, either as-is or after removing " person"
tw = tw[tw.term.apply(lambda x: x not in stopwords and x.replace(" person", "") not in stopwords)]

In [18]:
print tw[tw.term.apply(lambda x: ' person' in x)][1:15][['term','tot']].to_latex(index=False)


\begin{tabular}{lr}
\toprule
             term &      tot \\
\midrule
     black person &  1987792 \\
     wrong person &   365667 \\
     young person &   347727 \\
  favorite person &   273561 \\
       old person &   268043 \\
      nice person &   248868 \\
 beautiful person &   235131 \\
   amazing person &   230198 \\
       bad person &   223542 \\
      real person &   219006 \\
  innocent person &   170849 \\
    stupid person &   161701 \\
  homeless person &   156582 \\
    random person &   156044 \\
\bottomrule
\end{tabular}


In [32]:
import codecs

act = {x.strip() for x in codecs.open('dictionaries/my_dictionaries/identities.txt',"r","utf8")}
wordnet = {x.strip() for x in codecs.open('dictionaries/my_dictionaries/wordnet_identities.txt',"r","utf8")}
racial = {x.strip() for x in codecs.open('dictionaries/my_dictionaries/racial_slur_identities.txt',"r","utf8")}
national = {x.strip() for x in codecs.open('dictionaries/my_dictionaries/national_identities.txt',"r","utf8")}
job = {x.strip() for x in codecs.open('dictionaries/my_dictionaries/job_identities.txt',"r","utf8")}

all_wn = {x.strip() for x in codecs.open('all_wordnet_identities_terms.txt',"r","utf8")}
all_dict = act | wordnet | racial | national | job | all_wn
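The five reads above all follow one pattern; an equivalent loop (a sketch, assuming the same dictionaries/my_dictionaries/ layout and the all_wn set loaded in this cell) would be:

names = ['identities', 'wordnet_identities', 'racial_slur_identities',
         'national_identities', 'job_identities']
# one set of stripped terms per dictionary file
dicts = {n: {x.strip() for x in codecs.open('dictionaries/my_dictionaries/%s.txt' % n, "r", "utf8")}
         for n in names}
all_dict = set.union(*dicts.values()) | all_wn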

In [33]:
len(act), len(wordnet), len(racial), len(national), len(job), len(all_dict)


Out[33]:
(930, 8289, 1970, 189, 1567, 12588)

In [34]:
print tw[tw.term.apply(lambda x: x not in all_dict and ' person' not in x)][1:15][['term','tot']].to_latex(index=False)


\begin{tabular}{lr}
\toprule
      term &    tot \\
\midrule
      mess &  94490 \\
     human &  71371 \\
    legend &  52867 \\
      joke &  52579 \\
     pussy &  51172 \\
      thot &  47381 \\
  blessing &  45178 \\
 nightmare &  38819 \\
  disgrace &  33628 \\
     cutie &  33478 \\
    texter &  33067 \\
   goddess &  32475 \\
         g &  26668 \\
       old &  25815 \\
\bottomrule
\end{tabular}