In [7]:

    
from collections import Counter
import re
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.ensemble import RandomForestClassifier as RF

from itertools import islice, tee
from nltk.corpus import stopwords

Feature extraction step



In [8]:

    
def get_reviews():
    for review in open('test.txt'):
        review = review.replace('\n', '')
        yield review

def tokenize(sentence):
    words = re.findall("[a-zA-Z]+", sentence)
    bigram = []

    for gram in generate_ngrams(words, 2):
        bigram.append('{0} {1}'.format(gram[0], gram[1]))

    # take out stop words
#     words = [w for w in words if w not in stopwords.words("english")]
    words.extend(bigram)
    
    return words

def generate_ngrams(lst, n):
    ilst = lst
    while True:
        a, b = tee(ilst)
        l = tuple(islice(a, n))
        if len(l) == n:
            yield l
            next(b)
            ilst = b
        else:
            break
    
def get_classes():
    classes = {}
    for	train_pair in open('words_old.txt'):
        train_pair = train_pair.lower().replace('\n' , '').split()
        classes[train_pair[0]] = train_pair[1]
    return classes
    
def generate():
    X = []
    Y = []
    x = []
    y = []
    classes = get_classes()

    for review in get_reviews():
        for word in review.lower().split():
            if word in classes:
                # take out the correlated feature
#                 review = review.replace(word, '')
                X.append(Counter(tokenize(review)))
                Y.append(classes[word])
#                 break
    return X, Y



In [5]:

    
print(tokenize('this is a sentence, thanks price'))









    



['this', 'is', 'a', 'sentence', 'thanks', 'price', 'this is', 'is a', 'a sentence', 'sentence thanks', 'thanks price']



In [6]:

    
X, Y = generate()



In [151]:

    
len(X)









    Out[151]:





940044

Balance the classes



In [152]:

    
from collections import defaultdict
new_X = []
new_Y = []
counter = defaultdict(int)
for x, y in zip(X, Y):
    if counter[y] < 7000:
        new_X.append(x)
        new_Y.append(y)
        counter[y] += 1



In [153]:

    
len(new_X)









    Out[153]:





49000



In [154]:

    
v = DictVectorizer()
X_v = v.fit_transform(new_X)
t = TfidfTransformer(use_idf=False, sublinear_tf=True)
X_t = t.fit_transform(X_v)
del X



In [155]:

    
X_t.shape









    Out[155]:





(49000, 466680)



In [156]:

    
from sklearn.linear_model import SGDClassifier



In [157]:

    
from sklearn.cross_validation import train_test_split

Creating testing, training and validation sets



In [158]:

    
X_train, X_test, y_train, y_test = train_test_split(X_t, new_Y, test_size=0.4, random_state=0)



In [159]:

    
X_validation_set, X_testing_set, y_validation_set, y_testing_set = train_test_split(X_test, y_test,
                                                                                    test_size=0.5, random_state=0)

train



In [162]:

    
#clf_ = SGDClassifier(penalty="l1", loss='log', alpha=0.000001) # 0.82945893257807035
clf_ = SGDClassifier(penalty="elasticnet", loss='log', alpha=0.000001, n_jobs=4)

clf_.fit(X_train, y_train)









    Out[162]:





SGDClassifier(alpha=1e-06, class_weight=None, epsilon=0.1, eta0=0.0,
       fit_intercept=True, l1_ratio=0.15, learning_rate='optimal',
       loss='log', n_iter=5, n_jobs=4, penalty='elasticnet', power_t=0.5,
       random_state=None, rho=None, shuffle=False, verbose=0,
       warm_start=False)



In [163]:

    
clf_.score(X_validation_set, y_validation_set)









    Out[163]:





0.55704081632653057



In [164]:

    
clf_.score(X_validation_set, y_validation_set)









    Out[164]:





0.55704081632653057



In [165]:

    
from sklearn.svm import LinearSVC

# svc = LogisticRegression(dual=True, C=30)
svc = LinearSVC()
svc.fit(X_train, y_train)









    Out[165]:





LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='l2', multi_class='ovr', penalty='l2',
     random_state=None, tol=0.0001, verbose=0)



In [166]:

    
svc.score(X_validation_set, y_validation_set)









    Out[166]:





0.58989795918367349



In [167]:

    
svc.score(X_validation_set, y_validation_set)









    Out[167]:





0.58989795918367349

K-fold testing



In [24]:

    
from sklearn.cross_validation import StratifiedKFold as KFold
from sklearn.metrics import classification_report

namesClasses = ['Satisfaction', 'Content', 'Stability', 'Security',
          'Privacy', 'Pricing', 'Usefulness']

kf = KFold(new_Y, n_folds=5)
y_pred = new_Y * 0

for train, test in kf:
    X_train, X_test, y_train, y_test = X_t[train,:], X_t[test,:], new_Y[train], new_Y[test]
    clf_.fit(X_train, y_train)
    y_pred[test] = clf.predict(X_test)
print classification_report(new_Y, y_pred, target_names=namesClasses)









    



---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-24-48afe68a0926> in <module>()
      9 
     10 for train, test in kf:
---> 11     X_train, X_test, y_train, y_test = X_t[train,:], X_t[test,:], new_Y[train], new_Y[test]
     12     clf_.fit(X_train, y_train)
     13     y_pred[test] = clf.predict(X_test)

TypeError: only integer arrays with one element can be converted to an index



In [25]:

    
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB



In [26]:

    
#svc = LogisticRegression(dual=True, C=30)  # 0.781 with n = 19
#svc = LogisticRegression(dual=True, C=8)
svc = MultinomialNB()



In [27]:

    
svc.fit(X_train, y_train)









    Out[27]:





MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)



In [28]:

    
svc.score(X_validation_set, y_validation_set)









    Out[28]:





0.46336734693877552



In [29]:

    
svc.score(X_testing_set, y_testing_set)









    Out[29]:





0.46091836734693875

Create learning curve



In [31]:

    
X_train, X_test, y_train, y_test = train_test_split(X_t, new_Y, test_size=0.2, random_state=0)



In [96]:

    
def learning_curves(clf_):
    p = 0.05
    test_errors = []
    train_errors = []
    while(p < 1.0):
        X_cur_train, _, y_cur_train, _ = train_test_split(X_train, y_train, test_size=(1 - p), random_state=0)
        clf_.fit(X_cur_train, y_cur_train)
        train_error = clf_.score(X_cur_train, y_cur_train)
        print('Training accuracy %s' % train_error)
        train_errors.append(train_error)
        test_error = clf_.score(X_test, y_test)
        print('Test accuracy %s' % test_error)
        test_errors.append(test_error)
        p += 0.05



In [97]:

    
learning_curves(svc)









    



Training accuracy 0.997959183673
Test accuracy 0.412704081633
Training accuracy 0.995918367347
Test accuracy 0.448418367347
Training accuracy 0.99387755102
Test accuracy 0.471275510204
Training accuracy 0.991326530612
Test accuracy 0.492908163265
Training accuracy 0.989931972789
Test accuracy 0.509132653061
Training accuracy 0.988662131519
Test accuracy 0.51887755102
Training accuracy 0.986491739553
Test accuracy 0.526071428571
Training accuracy 0.985542988349
Test accuracy 0.536683673469
Training accuracy 0.984050192758
Test accuracy 0.541632653061
Training accuracy 0.983333333333
Test accuracy 0.548571428571
Training accuracy 0.981631517101
Test accuracy 0.553316326531
Training accuracy 0.979761904762
Test accuracy 0.558826530612
Training accuracy 0.979905808477
Test accuracy 0.563928571429
Training accuracy 0.978085519922
Test accuracy 0.570102040816
Training accuracy 0.976961451247
Test accuracy 0.572397959184
Training accuracy 0.975850340136
Test accuracy 0.576989795918
Training accuracy 0.974829931973
Test accuracy 0.58137755102
Training accuracy 0.97343159486
Test accuracy 0.583928571429
Training accuracy 0.972323666309
Test accuracy 0.586887755102



In [111]:

    
import matplotlib.pyplot as plt
%matplotlib inline

n = range(0, len(train_errors))

plt.plot(n, [1 - i for i in train_errors] , 'g-', n, [1 - i for i in test_errors], 'r-', label='error', linewidth=2)
plt.show()

Create predict function for review inputs



In [98]:

    
def predict_review(clf_, review):
    d = ['Satisfaction', 'Content', 'Stability', 'Security',
          'Privacy', 'Pricing', 'Usefulness']
    
    x = t.transform(v.transform(Counter(tokenize(review))))
    return sorted([(d[i], p) for i, p in enumerate(clf_.predict_proba(x)[0]) if p > 0.2], key=lambda x: -x[1])

Try an example --- "pretty cool app i really like the concept of following stories as opposed to people or sites excited to see where they go with it from here latest update made the ui look much cleaner still not many options for sharing as of now (only reason i didn't give it a 5) but that should hopefully change with future updates"



In [99]:

    
review1 = "pretty cool app i really like the concept of following stories as opposed to people or sites excited to see where they go with it from here  latest update made the ui look much cleaner still not many options for sharing as of now (only reason i didn't give it a 5) but that should hopefully change with future updates;"



In [101]:

    
predict_review(clf_, "this is a super funny app like it's really awesome and gets but maybe you should be able to store your videos in the app so you can look back on funny things get this ap")









    Out[101]:





[('Privacy', 0.43030370976000792)]



In [102]:

    
review2 = "i love this in many ways: easy to use; really helps me track calories; huge database; etc etc downsides:  twitchy ; needs more options for portion sizes (3rds wd be nice); items are excessively repeated; no way to track how you are doing across multiple days w/o going online nevertheless this is well worth the small cost, definitely recommend it"
predict_review(clf_, review2)









    Out[102]:





[('Pricing', 0.50637891531794832), ('Security', 0.20416192055056273)]



In [103]:

    
review3 = 'it lulls my grandsons right to sleep'
predict_review(clf_, review3)









    Out[103]:





[('Privacy', 0.61536982807617091)]



In [104]:

    
review3 = """enjoying this game it's pretty easy to learn yet challenging at the same time the instructions are pretty vague but you can pretty much figure everything out after a few times examplei didn't know that you go tap the machine in the bottom right corner & in a few seconds it dispenses a pill that will add 1 heart to a patient as long as they are not in line to check out i quickly learned this but some of their bonuses aren't explained very well if at all in the tutorial"""
predict_review(clf_, review3)









    Out[104]:





[('Content', 0.29645346728152561), ('Satisfaction', 0.20806045376627161)]



In [105]:

    
review4 = "the idea of this app is great but it is very unstable on the iphone and seems to revert to a web page version that isn't as mobile friendly this app needs better integration with home depot's money online shopping"
predict_review(clf_, review4)









    Out[105]:





[('Stability', 0.45881703274014324), ('Security', 0.22674362573901735)]



In [106]:

    
predict_review(clf_, 'this app sucks. it is not good at anything;  can I get my money back please???!!!')









    Out[106]:





[('Stability', 0.34463497512112207), ('Pricing', 0.29578894881673229)]



In [107]:

    
predict_review(clf_, "i purchased this app months ago it is a favorite recently it is asking me to buy all the songs that we have had in the app what gives? is there something wrong? please help very frustrated customer")









    Out[107]:





[('Stability', 0.44381997396083417), ('Pricing', 0.22184731079836245)]



In [108]:

    
predict_review(clf_, "i love the article synopsis but i feel like the flipboard ux is easier to use the half scroll / page flip gets me once in a while - when the update is slightly longer than my current view")









    Out[108]:





[('Content', 0.51377182343682015)]



In [109]:

    
predict_review(clf_, " these days it helps for apps to filter through news and present the best this app is great and easy to scroll through to get the daily highlights")









    Out[109]:





[('Content', 0.41898571157759673), ('Stability', 0.2013571126793022)]



In [110]:

    
predict_review(clf_,  "i have been looking for a quality alternative to costly wsj and nyt subscriptions and this is it circa has great articles and all the info i need to  carry a conversation with someone on the topic i love it highly recommended")









    Out[110]:





[('Content', 0.38363713376182923)]



In [213]:

    
from sklearn.externals import joblib



In [217]:

    
joblib.dump(clf_, '../../review_classifier/review_classification.pkl')









    Out[217]:





['../../review_classifier/review_classification.pkl',
 '../../review_classifier/review_classification.pkl_01.npy',
 '../../review_classifier/review_classification.pkl_02.npy',
 '../../review_classifier/review_classification.pkl_03.npy',
 '../../review_classifier/review_classification.pkl_04.npy']



In [227]:

    
joblib.dump(v, '../../review_classifier/dict_vectorizer.pkl')









    Out[227]:





['../../review_classifier/dict_vectorizer.pkl']



In [228]:

    
joblib.dump(t, '../../review_classifier/tfidf_transformer.pkl')









    Out[228]:





['../../review_classifier/tfidf_transformer.pkl']



In [ ]: