In [1]:
from utility_code.rule_based_features import *
from utility_code.util import *
from utility_code.create_features import *
# Load the identity terms extracted from Twitter via distant supervision.
tw_distant_supervision_identity_dat = get_twitter_distant_supervision_identity_dat("../r/output_fil.tsv")
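The loader itself lives in utility_code and is not shown; a minimal stand-in, assuming the TSV carries at least the term and tot columns used below (anything else the real function does is unknown):

import pandas as pd

def load_identity_tsv(path):
    # Hypothetical stand-in for get_twitter_distant_supervision_identity_dat:
    # read the tab-separated file with the 'term' and 'tot' columns used below.
    return pd.read_csv(path, sep="\t", encoding="utf-8")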
In [2]:
tw = tw_distant_supervision_identity_dat  # shorter alias for the rest of the notebook
In [4]:
stopwords = get_stopwords()  # stopword set used to filter out non-identity terms
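get_stopwords is also project code; a plausible stand-in that returns a plain set (NLTK's English list is an assumption here, not necessarily the list the notebook used):

from nltk.corpus import stopwords as nltk_stopwords

def get_stopwords_sketch():
    # Assumption: the project helper returns a set of lowercase stopword strings.
    return set(nltk_stopwords.words("english"))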
In [9]:
# Drop terms that are stopwords, including "<stopword> person" compounds:
# stripping the " person" suffix reduces e.g. "other person" to the stopword "other".
tw = tw[tw.term.apply(lambda x: x not in stopwords and x.replace(" person", "") not in stopwords)]
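A toy run of the same filter, to make the " person" suffix handling concrete (data invented for illustration):

import pandas as pd

toy = pd.DataFrame({"term": ["other person", "young person", "the"],
                    "tot": [12, 7, 3]})
sw = {"other", "the"}
# "other person" is dropped: stripping " person" leaves the stopword "other";
# "young person" survives because "young" is not a stopword.
print(toy[toy.term.apply(lambda t: t not in sw and t.replace(" person", "") not in sw)])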
In [18]:
# LaTeX table of the "... person" compound terms and their counts.
print(tw[tw.term.apply(lambda x: ' person' in x)][1:15][['term', 'tot']].to_latex(index=False))
In [32]:
import codecs

# Load the hand-built identity dictionaries, one term per line.
act = {x.strip() for x in codecs.open('dictionaries/my_dictionaries/identities.txt', "r", "utf8")}
wordnet = {x.strip() for x in codecs.open('dictionaries/my_dictionaries/wordnet_identities.txt', "r", "utf8")}
racial = {x.strip() for x in codecs.open('dictionaries/my_dictionaries/racial_slur_identities.txt', "r", "utf8")}
national = {x.strip() for x in codecs.open('dictionaries/my_dictionaries/national_identities.txt', "r", "utf8")}
job = {x.strip() for x in codecs.open('dictionaries/my_dictionaries/job_identities.txt', "r", "utf8")}
all_wn = {x.strip() for x in codecs.open('all_wordnet_identities_terms.txt', "r", "utf8")}
# Union of every dictionary.
all_dict = act | wordnet | racial | national | job | all_wn
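The six set comprehensions repeat one pattern; a small helper (hypothetical, not part of utility_code) would make that explicit:

import codecs

def load_term_set(path):
    # One identity term per line, stripped of surrounding whitespace.
    with codecs.open(path, "r", "utf8") as f:
        return {line.strip() for line in f}

# e.g. act = load_term_set('dictionaries/my_dictionaries/identities.txt')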
In [33]:
# Sanity check: sizes of the individual dictionaries and their union
# (len(all_wn) is not printed here, but all_wn is included in all_dict).
len(act), len(wordnet), len(racial), len(national), len(job), len(all_dict)
Out[33]:
In [34]:
# LaTeX table of terms covered by none of the dictionaries
# (excluding the "... person" compounds already shown above).
print(tw[tw.term.apply(lambda x: x not in all_dict and ' person' not in x)][1:15][['term', 'tot']].to_latex(index=False))
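A natural follow-up, not in the original notebook, is to measure how much of the extracted vocabulary the combined dictionaries already cover:

# Hypothetical follow-up: fraction of extracted terms found in all_dict.
in_dict = tw.term.apply(lambda t: t in all_dict)
print("dictionary coverage: %.1f%%" % (100.0 * in_dict.mean()))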