In [7]:
from collections import Counter
import re
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier as RF
from itertools import islice, tee
from nltk.corpus import stopwords
In [8]:
def get_reviews():
    # stream reviews from the raw text file, one per line
    for review in open('test.txt'):
        review = review.replace('\n', '')
        yield review

def tokenize(sentence):
    # unigram tokens plus space-joined bigrams as features
    words = re.findall("[a-zA-Z]+", sentence)
    bigram = []
    for gram in generate_ngrams(words, 2):
        bigram.append('{0} {1}'.format(gram[0], gram[1]))
    # take out stop words
    # words = [w for w in words if w not in stopwords.words("english")]
    words.extend(bigram)
    return words

def generate_ngrams(lst, n):
    # yield successive n-grams (as tuples) from a list of tokens
    ilst = lst
    while True:
        a, b = tee(ilst)
        l = tuple(islice(a, n))
        if len(l) == n:
            yield l
            next(b)
            ilst = b
        else:
            break

def get_classes():
    # map each keyword to its class label from the training pairs file
    classes = {}
    for train_pair in open('words_old.txt'):
        train_pair = train_pair.lower().replace('\n', '').split()
        classes[train_pair[0]] = train_pair[1]
    return classes

def generate():
    # build (features, label) pairs: a review yields one example per
    # class keyword it contains
    X = []
    Y = []
    classes = get_classes()
    for review in get_reviews():
        for word in review.lower().split():
            if word in classes:
                # take out the correlated feature
                # review = review.replace(word, '')
                X.append(Counter(tokenize(review)))
                Y.append(classes[word])
                # break
    return X, Y
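As a quick illustration, the n-gram helper can be sanity-checked on a tiny token list; the expected output follows directly from the definition above:
In [ ]:
# expected: [('this', 'is'), ('is', 'a'), ('a', 'test')]
print(list(generate_ngrams(['this', 'is', 'a', 'test'], 2)))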
In [5]:
print(tokenize('this is a sentence, thanks price'))
In [6]:
X, Y = generate()
In [151]:
len(X)
Out[151]:
In [152]:
from collections import defaultdict

# cap each class at 7,000 examples to keep the training set roughly balanced
new_X = []
new_Y = []
counter = defaultdict(int)
for x, y in zip(X, Y):
    if counter[y] < 7000:
        new_X.append(x)
        new_Y.append(y)
        counter[y] += 1
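A quick look at the resulting class distribution (illustrative; the exact counts depend on the data files):
In [ ]:
# number of retained examples per class after capping
print(Counter(new_Y))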
In [153]:
len(new_X)
Out[153]:
In [154]:
v = DictVectorizer()
X_v = v.fit_transform(new_X)
t = TfidfTransformer(use_idf=False, sublinear_tf=True)
X_t = t.fit_transform(X_v)
del X
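The fitted vectorizer can be inspected to see what the feature space looks like (a minimal sketch; get_feature_names is the DictVectorizer accessor in this scikit-learn version):
In [ ]:
# total number of unigram/bigram features and a small sample of them
print(len(v.get_feature_names()))
print(v.get_feature_names()[:10])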
In [155]:
X_t.shape
Out[155]:
In [156]:
from sklearn.linear_model import SGDClassifier
In [157]:
from sklearn.cross_validation import train_test_split
In [158]:
X_train, X_test, y_train, y_test = train_test_split(X_t, new_Y, test_size=0.4, random_state=0)
In [159]:
X_validation_set, X_testing_set, y_validation_set, y_testing_set = train_test_split(
    X_test, y_test, test_size=0.5, random_state=0)
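A sanity check on the resulting 60/20/20 split sizes (illustrative only):
In [ ]:
# train / validation / test row counts
print(X_train.shape[0], X_validation_set.shape[0], X_testing_set.shape[0])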
In [162]:
#clf_ = SGDClassifier(penalty="l1", loss='log', alpha=0.000001) # 0.82945893257807035
clf_ = SGDClassifier(penalty="elasticnet", loss='log', alpha=0.000001, n_jobs=4)
clf_.fit(X_train, y_train)
Out[162]:
In [163]:
clf_.score(X_validation_set, y_validation_set)
Out[163]:
In [164]:
clf_.score(X_validation_set, y_validation_set)
Out[164]:
In [165]:
from sklearn.svm import LinearSVC
# svc = LogisticRegression(dual=True, C=30)
svc = LinearSVC()
svc.fit(X_train, y_train)
Out[165]:
In [166]:
svc.score(X_validation_set, y_validation_set)
Out[166]:
In [167]:
svc.score(X_validation_set, y_validation_set)
Out[167]:
In [24]:
from sklearn.cross_validation import StratifiedKFold as KFold
from sklearn.metrics import classification_report
import numpy as np

namesClasses = ['Satisfaction', 'Content', 'Stability', 'Security',
                'Privacy', 'Pricing', 'Usefulness']
new_Y = np.array(new_Y)
kf = KFold(new_Y, n_folds=5)
y_pred = new_Y.copy()  # placeholder, overwritten fold by fold
for train, test in kf:
    X_train, X_test, y_train, y_test = X_t[train, :], X_t[test, :], new_Y[train], new_Y[test]
    clf_.fit(X_train, y_train)
    y_pred[test] = clf_.predict(X_test)
print(classification_report(new_Y, y_pred, target_names=namesClasses))
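If the installed scikit-learn provides it, cross_val_predict gives the same kind of out-of-fold predictions more compactly (a sketch under that assumption):
In [ ]:
# out-of-fold predictions with 5-fold CV, then the same report
from sklearn.cross_validation import cross_val_predict
y_cv_pred = cross_val_predict(clf_, X_t, new_Y, cv=5)
print(classification_report(new_Y, y_cv_pred, target_names=namesClasses))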
In [25]:
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
In [26]:
#svc = LogisticRegression(dual=True, C=30) # 0.781 with n = 19
#svc = LogisticRegression(dual=True, C=8)
svc = MultinomialNB()
In [27]:
svc.fit(X_train, y_train)
Out[27]:
In [28]:
svc.score(X_validation_set, y_validation_set)
Out[28]:
In [29]:
svc.score(X_testing_set, y_testing_set)
Out[29]:
In [31]:
X_train, X_test, y_train, y_test = train_test_split(X_t, new_Y, test_size=0.2, random_state=0)
In [96]:
def learning_curves(clf_):
    # train on increasing fractions of the training set and record accuracies
    p = 0.05
    test_errors = []
    train_errors = []
    while p < 1.0:
        X_cur_train, _, y_cur_train, _ = train_test_split(X_train, y_train, test_size=(1 - p), random_state=0)
        clf_.fit(X_cur_train, y_cur_train)
        train_error = clf_.score(X_cur_train, y_cur_train)
        print('Training accuracy %s' % train_error)
        train_errors.append(train_error)
        test_error = clf_.score(X_test, y_test)
        print('Test accuracy %s' % test_error)
        test_errors.append(test_error)
        p += 0.05
    return train_errors, test_errors
In [97]:
train_errors, test_errors = learning_curves(svc)
In [111]:
import matplotlib.pyplot as plt
%matplotlib inline
n = range(0, len(train_errors))
plt.plot(n, [1 - i for i in train_errors], 'g-', label='train error', linewidth=2)
plt.plot(n, [1 - i for i in test_errors], 'r-', label='test error', linewidth=2)
plt.legend()
plt.show()
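scikit-learn also ships a helper that computes cross-validated learning curves directly; a minimal sketch, assuming the pre-0.18 module layout used elsewhere in this notebook:
In [ ]:
import numpy as np
from sklearn.learning_curve import learning_curve

# cross-validated train/test scores at increasing training-set sizes
sizes, train_scores, test_scores = learning_curve(
    svc, X_t, np.array(new_Y), train_sizes=np.linspace(0.1, 1.0, 10), cv=3)
plt.plot(sizes, 1 - train_scores.mean(axis=1), 'g-', label='train error')
plt.plot(sizes, 1 - test_scores.mean(axis=1), 'r-', label='cv error')
plt.legend()
plt.show()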
In [98]:
def predict_review(clf_, review):
    # note: predict_proba needs a probabilistic classifier (e.g. the log-loss
    # SGDClassifier or MultinomialNB above); LinearSVC does not provide it
    d = ['Satisfaction', 'Content', 'Stability', 'Security',
         'Privacy', 'Pricing', 'Usefulness']
    x = t.transform(v.transform(Counter(tokenize(review))))
    return sorted([(d[i], p) for i, p in enumerate(clf_.predict_proba(x)[0]) if p > 0.2],
                  key=lambda x: -x[1])
In [99]:
review1 = "pretty cool app i really like the concept of following stories as opposed to people or sites excited to see where they go with it from here latest update made the ui look much cleaner still not many options for sharing as of now (only reason i didn't give it a 5) but that should hopefully change with future updates;"
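For completeness, the helper can be applied to review1 as well (illustrative call; the labels returned depend on the trained model):
In [ ]:
predict_review(clf_, review1)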
In [101]:
predict_review(clf_, "this is a super funny app like it's really awesome and gets but maybe you should be able to store your videos in the app so you can look back on funny things get this ap")
Out[101]:
In [102]:
review2 = "i love this in many ways: easy to use; really helps me track calories; huge database; etc etc downsides: twitchy ; needs more options for portion sizes (3rds wd be nice); items are excessively repeated; no way to track how you are doing across multiple days w/o going online nevertheless this is well worth the small cost, definitely recommend it"
predict_review(clf_, review2)
Out[102]:
In [103]:
review3 = 'it lulls my grandsons right to sleep'
predict_review(clf_, review3)
Out[103]:
In [104]:
review3 = """enjoying this game it's pretty easy to learn yet challenging at the same time the instructions are pretty vague but you can pretty much figure everything out after a few times examplei didn't know that you go tap the machine in the bottom right corner & in a few seconds it dispenses a pill that will add 1 heart to a patient as long as they are not in line to check out i quickly learned this but some of their bonuses aren't explained very well if at all in the tutorial"""
predict_review(clf_, review3)
Out[104]:
In [105]:
review4 = "the idea of this app is great but it is very unstable on the iphone and seems to revert to a web page version that isn't as mobile friendly this app needs better integration with home depot's money online shopping"
predict_review(clf_, review4)
Out[105]:
In [106]:
predict_review(clf_, 'this app sucks. it is not good at anything; can I get my money back please???!!!')
Out[106]:
In [107]:
predict_review(clf_, "i purchased this app months ago it is a favorite recently it is asking me to buy all the songs that we have had in the app what gives? is there something wrong? please help very frustrated customer")
Out[107]:
In [108]:
predict_review(clf_, "i love the article synopsis but i feel like the flipboard ux is easier to use the half scroll / page flip gets me once in a while - when the update is slightly longer than my current view")
Out[108]:
In [109]:
predict_review(clf_, " these days it helps for apps to filter through news and present the best this app is great and easy to scroll through to get the daily highlights")
Out[109]:
In [110]:
predict_review(clf_, "i have been looking for a quality alternative to costly wsj and nyt subscriptions and this is it circa has great articles and all the info i need to carry a conversation with someone on the topic i love it highly recommended")
Out[110]:
In [213]:
from sklearn.externals import joblib
In [217]:
joblib.dump(clf_, '../../review_classifier/review_classification.pkl')
Out[217]:
In [227]:
joblib.dump(v, '../../review_classifier/dict_vectorizer.pkl')
Out[227]:
In [228]:
joblib.dump(t, '../../review_classifier/tfidf_transformer.pkl')
Out[228]:
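The persisted model and feature pipeline can be loaded back elsewhere with joblib.load; a minimal sketch, reusing the paths above:
In [ ]:
# reload the fitted classifier, vectorizer and tf transformer from disk
clf_loaded = joblib.load('../../review_classifier/review_classification.pkl')
v_loaded = joblib.load('../../review_classifier/dict_vectorizer.pkl')
t_loaded = joblib.load('../../review_classifier/tfidf_transformer.pkl')
x = t_loaded.transform(v_loaded.transform(Counter(tokenize('love the app but it crashes a lot'))))
print(clf_loaded.predict(x))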
In [ ]: