In [ ]:
import random

# Label each review line: "-1 " for negative, "+1 " for positive
with open('rt-polarity.neg.utf8', 'r') as f:
    negative_list = ['-1 ' + line for line in f]
with open('rt-polarity.pos.utf8', 'r') as f:
    positive_list = ['+1 ' + line for line in f]

# Combine both classes and shuffle so labels are interleaved
concatenate = positive_list + negative_list
random.shuffle(concatenate)
with open('sentiment.txt', 'w') as f:
    f.write(''.join(concatenate))
In [ ]:
from nltk.corpus import stopwords

# NLTK's English stop word list
stopwords_list = stopwords.words('english')
print(stopwords_list)
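The NLTK list is not wired into the later cells (the TF-IDF step below uses scikit-learn's ENGLISH_STOP_WORDS instead); as a purely illustrative sketch, with a made-up sample sentence, it could be used to filter tokens like this:
In [ ]:
# Illustrative only: drop NLTK stop words from a sample review
sample = "this is an intensely romantic and engaging mystery"
print([w for w in sample.split() if w not in stopwords_list])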
In [ ]:
from nltk.stem.porter import PorterStemmer

def feature(sentence):
    # Stem every token of a labelled sentence ("+1 ..." or "-1 ...")
    porter = PorterStemmer()
    result = []
    label = sentence[0:2]              # the "+1" / "-1" label
    for s in sentence[3:].split(' '):  # tokens after the label and space
        result.append(porter.stem(s))
    return label + " " + " ".join(result)
In [ ]:
feature("+1 intensely romantic , thought-provoking and even an engaging mystery . ")
In [68]:
# Imports for TF-IDF feature extraction and for learning a logistic regression model.
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
In [69]:
tfv = TfidfVectorizer(encoding='utf-8', lowercase=True,
                      stop_words=ENGLISH_STOP_WORDS,
                      # token_pattern=r'(?u)\b\w\w+\b',  # scikit-learn's default pattern
                      ngram_range=(1, 2))
In [72]:
with open('sentiment.txt') as f:
    features = [(s[:2], s[3:]) for s in f]

# label list ("+1" / "-1")
label = [i[0] for i in features]
# sentence list (stop words are removed later by the TF-IDF vectorizer)
sentence = [i[1] for i in features]
In [94]:
# Fit the vectorizer on the reviews, one document per sentence
tfv_vector = tfv.fit_transform(sentence)
In [89]:
# Inspect the resulting document-term matrix (one row per review)
tfv_vector.shape
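LogisticRegression is imported above but never fit; a minimal sketch of that remaining step, assuming a simple held-out split (the 80/20 ratio and random_state are illustrative choices, not from the original notebook):
In [ ]:
from sklearn.model_selection import train_test_split

# Hold out part of the data for evaluation (80/20 split is an assumption)
X_train, X_test, y_train, y_test = train_test_split(
    tfv_vector, label, test_size=0.2, random_state=0)

# Learn a logistic regression classifier on the TF-IDF features
clf = LogisticRegression()
clf.fit(X_train, y_train)

# Mean accuracy on the held-out reviews
print(clf.score(X_test, y_test))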