In [2]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import numpy as np
import random
import pickle
from collections import Counter

# word_tokenize needs the 'punkt' tokenizer models and WordNetLemmatizer
# needs the 'wordnet' corpus; fetch them once with nltk.download().
lemmatizer = WordNetLemmatizer()
hm_lines = 1000000  # cap on how many lines to read from each file
def create_lexicon(pos, neg):
    # Tokenize every line of both files into one flat word list.
    lexicon = []
    for fi in [pos, neg]:
        with open(fi, 'r') as f:
            contents = f.readlines()
            for l in contents[:hm_lines]:
                all_words = word_tokenize(l.lower())
                lexicon += list(all_words)
    lexicon = [lemmatizer.lemmatize(i) for i in lexicon]
    w_counts = Counter(lexicon)
    l2 = []
    for w in w_counts:
        # Keep only words that are neither too common nor too rare.
        if 1000 > w_counts[w] > 50:
            l2.append(w)
    return l2
def sample_handling(sample, lexicon, classification):
    # Convert each line of the file into a bag-of-words count vector
    # over the lexicon, paired with its one-hot class label.
    featureset = []
    with open(sample, 'r') as f:
        contents = f.readlines()
        for l in contents[:hm_lines]:
            current_words = word_tokenize(l.lower())
            current_words = [lemmatizer.lemmatize(i) for i in current_words]
            features = np.zeros(len(lexicon))
            for word in current_words:
                if word.lower() in lexicon:
                    index_value = lexicon.index(word.lower())
                    features[index_value] += 1
            features = list(features)
            featureset.append([features, classification])
    return featureset
def create_featuresets_and_labels(pos, neg, test_size=0.1):
    lexicon = create_lexicon(pos, neg)
    features = []
    features += sample_handling(pos, lexicon, [1, 0])
    features += sample_handling(neg, lexicon, [0, 1])
    random.shuffle(features)
    # dtype=object because each row pairs a feature list with a label list
    # of a different length; modern NumPy rejects ragged arrays otherwise.
    features = np.array(features, dtype=object)
    testing_size = int(test_size * len(features))
    train_x = list(features[:, 0][:-testing_size])  # the 0th element (feature vector) of every row
    train_y = list(features[:, 1][:-testing_size])
    test_x = list(features[:, 0][-testing_size:])
    test_y = list(features[:, 1][-testing_size:])
    return train_x, train_y, test_x, test_y
In [ ]:
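# A minimal usage sketch, assuming the positive and negative samples live in
# 'pos.txt' and 'neg.txt' next to the notebook; the pickle filename is
# illustrative, not prescribed by the code above.
train_x, train_y, test_x, test_y = create_featuresets_and_labels('pos.txt', 'neg.txt')

# Persist the featuresets so the expensive tokenize/lemmatize pass
# only has to run once.
with open('sentiment_set.pickle', 'wb') as f:
    pickle.dump([train_x, train_y, test_x, test_y], f)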