In [10]:
import os
# os.listdir() returns entries in arbitrary order; sort them so that files[0]
# and the order of the loaded reviews are reproducible across machines and runs.
files = sorted(os.listdir('aclimdb/train/pos'))

In [12]:
# Read the first positive training review as a quick sanity check on the data.
first_file = files[0]
with open('aclimdb/train/pos/{}'.format(first_file),'r',encoding='utf-8') as f:
    # The with-statement closes the file when the block exits, so an explicit
    # f.close() is unnecessary.
    review = f.read()

In [13]:
# Display the raw text of the review that was just read.
review


Out[13]:
'Bromwell High is a cartoon comedy. It ran at the same time as some other programs about school life, such as "Teachers". My 35 years in the teaching profession lead me to believe that Bromwell High\'s satire is much closer to reality than is "Teachers". The scramble to survive financially, the insightful students who can see right through their pathetic teachers\' pomp, the pettiness of the whole situation, all remind me of the schools I knew and their students. When I saw the episode in which a student repeatedly tried to burn down the school, I immediately recalled ......... at .......... High. A classic line: INSPECTOR: I\'m here to sack one of your teachers. STUDENT: Welcome to Bromwell High. I expect that many adults of my age think that Bromwell High is far fetched. What a pity that it isn\'t!'

In [16]:
# Load every positive training review into memory (one string per file).
pos_train_list=[]
for file in files:
    with open('aclimdb/train/pos/{}'.format(file),'r',encoding='utf-8') as f:
        # Leaving the with-block closes the file, so no explicit close() is needed.
        # NOTE: this rebinds the module-level name `review` on every iteration;
        # after the loop it holds the text of the last file read.
        review = f.read()
    pos_train_list.append(review)
# Report how many reviews were loaded.
print(len(pos_train_list))


12500

In [17]:
import nltk
from nltk.corpus import sentiwordnet as swn

In [18]:
# senti_synsets() returns a lazy filter object rather than a list, so the bare
# call only shows the object's repr; wrap it in list() to see the senses.
swn.senti_synsets('hate')


Out[18]:
<filter at 0x1f1ceb4f780>

In [19]:
# Materialize every SentiWordNet sense of 'hate', regardless of part of speech.
list(swn.senti_synsets('hate'))


Out[19]:
[SentiSynset('hate.n.01'), SentiSynset('hate.v.01')]

In [20]:
# Restrict the lookup to verb senses by passing the part-of-speech code 'v'.
list(swn.senti_synsets('hate','v'))


Out[20]:
[SentiSynset('hate.v.01')]

In [21]:
# Positive-sentiment score of the first verb sense of 'hate'.
list(swn.senti_synsets('hate','v'))[0].pos_score()


Out[21]:
0.0

In [22]:
# Negative-sentiment score of the first verb sense of 'hate'.
list(swn.senti_synsets('hate','v'))[0].neg_score()


Out[22]:
0.75

In [27]:
def word_sentiment_calculator(word, tag):
    """Return the average SentiWordNet sentiment of ``word`` for its part of speech.

    Args:
        word: The token to look up.
        tag: The token's part-of-speech tag as produced by ``nltk.pos_tag``
            (e.g. ``'NN'``, ``'VBP'``). Only tags containing ``'NN'``, ``'VB'``,
            ``'JJ'`` or ``'RB'`` are scored; they select the SentiWordNet
            part-of-speech codes ``'n'``, ``'v'``, ``'a'`` and ``'r'``.

    Returns:
        A ``(pos_score, neg_score)`` tuple holding the mean positive and mean
        negative score over all senses found for the word. ``(0, 0)`` is
        returned when the tag matches none of the handled fragments or when
        SentiWordNet has no sense for the word under the matching code.
    """
    # Tag fragment -> SentiWordNet POS code, checked in this order.
    tag_to_swn_pos = (('NN', 'n'), ('VB', 'v'), ('JJ', 'a'), ('RB', 'r'))

    for tag_fragment, swn_pos in tag_to_swn_pos:
        if tag_fragment not in tag:
            continue
        # Look the word up once and reuse the result, instead of querying
        # SentiWordNet a second time after the emptiness check.
        syn_set = list(swn.senti_synsets(word, swn_pos))
        if syn_set:
            break
    else:
        # No handled tag fragment produced any sense for this word.
        return (0, 0)

    pos_score = 0
    neg_score = 0
    for syn in syn_set:
        pos_score += syn.pos_score()
        neg_score += syn.neg_score()
    return (pos_score/len(syn_set), neg_score/len(syn_set))

In [ ]:


In [30]:
# Example: average sentiment of 'love' over its noun senses.
word_sentiment_calculator('love','NN')


Out[30]:
(0.22916666666666666, 0.0)

In [31]:
# Example: average sentiment of 'love' over its verb senses.
word_sentiment_calculator('love','VB')


Out[31]:
(0.625, 0.03125)

In [38]:
# Tokenize a short example sentence and tag each token with its part of speech.
sent = 'I hate you'
tokens = nltk.word_tokenize(sent)
pos_tags = nltk.pos_tag(tokens)
# Display the resulting (token, tag) pairs.
pos_tags


Out[38]:
[('I', 'PRP'), ('hate', 'VBP'), ('you', 'PRP')]

In [52]:
def sentence_sentiment_calculator(a):
    """Return the summed word-level sentiment of a piece of text.

    The text is tokenized with ``nltk.word_tokenize`` and tagged with
    ``nltk.pos_tag``; each ``(word, tag)`` pair is scored with
    ``word_sentiment_calculator`` and the per-word scores are added up.

    Args:
        a: The text to score.

    Returns:
        A ``(pos_score, neg_score)`` tuple with the total positive and total
        negative score over all tokens. The totals are sums, not averages, so
        they are not normalized by the number of tokens.
    """
    tokens = nltk.word_tokenize(a)
    pos_tags = nltk.pos_tag(tokens)
    pos_score = 0
    neg_score = 0
    for word, tag in pos_tags:
        # Score each token once and unpack both components, rather than
        # calling the word-level scorer separately for each component.
        word_pos, word_neg = word_sentiment_calculator(word, tag)
        pos_score += word_pos
        neg_score += word_neg
    return (pos_score, neg_score)

In [54]:
# Score the text currently bound to `review`.
# NOTE(review): the loop that fills pos_train_list also assigns to `review`, so
# this may not be the review displayed earlier in the notebook - confirm.
sentence_sentiment_calculator(review)


Out[54]:
(1.1239035087719298, 0.2609649122807018)

In [ ]: