In [2]:
from collections import Counter

import json

In [8]:
%%time
reviews = []
with open("yelp_academic_dataset_review.json") as f:
    i = 0
    for line in f:
        if i % 1000000 == 0:
            print(i)
        i += 1
        reviews.append(json.loads(line))


0
1000000
2000000
3000000
4000000
CPU times: user 43 s, sys: 4.22 s, total: 47.3 s
Wall time: 1min 27s

In [9]:
# Peek at one record to confirm the schema (user_id, business_id, text, stars, ...)
print(reviews[0])


{'user_id': 'KpkOkG6RIf4Ra25Lhhxf1A', 'stars': 5, 'review_id': 'NxL8SIC5yqOdnlXCg18IBg', 'text': "If you enjoy service by someone who is as competent as he is personable, I would recommend Corey Kaplan highly. The time he has spent here has been very productive and working with him educational and enjoyable. I hope not to need him again (though this is highly unlikely) but knowing he is there if I do is very nice. By the way, I'm not from El Centro, CA. but Scottsdale, AZ.", 'useful': 0, 'date': '2011-10-10', 'cool': 0, 'funny': 0, 'type': 'review', 'business_id': '2aFiy99vNLklCx3T_tGS9A'}

In [81]:
# author -> all of that author's review texts, joined by newlines.
# Collect the texts in lists and join once at the end: the original
# `by_author[uid] += "\n" + text` rebuilds the accumulated string on
# every append, which is quadratic over millions of reviews.
texts_by_author = {}
for review in reviews:
    texts_by_author.setdefault(review['user_id'], []).append(review['text'])
by_author = {uid: "\n".join(texts) for uid, texts in texts_by_author.items()}

In [82]:
len(by_author)  # number of distinct authors before length filtering


Out[82]:
1029432

In [83]:
# Keep only authors with more than 6010 characters of text in total:
# enough for a 3000-char "known" sample, a 10-char gap, and a
# 3000-char "unknown" sample (see the split below).
by_author = {a: by_author[a] for a in by_author if len(by_author[a]) > 6010}

In [85]:
len(by_author)  # authors remaining after the 6010-character filter


Out[85]:
71346

In [86]:
known_texts = []
unknown_texts = []

# Take two 3000-character samples per author, separated by a 10-character
# gap so the two halves cannot be trivially matched at the seam.
for author in by_author:
    text = by_author[author]
    known_texts.append(text[:3000])
    unknown_texts.append(text[3010:6010])

In [91]:
len(unknown_texts)  # one unknown-side sample per surviving author


Out[91]:
71346

In [92]:
# Truncate to a round number of authors. The first half become
# same-author pairs, the second half different-author pairs.
total = 71300
half = total // 2

known_same = known_texts[:half]
unknown_same = unknown_texts[:half]

known_diff = known_texts[half:total]
unknown_diff = unknown_texts[half:total]

# Rotate the unknown half by one position so every "different" pair
# really pairs two distinct authors.
unknown_diff = unknown_diff[1:] + unknown_diff[:1]

knowns = known_same + known_diff
unknowns = unknown_same + unknown_diff

In [93]:
len(knowns)  # total number of known-side texts (same + diff halves)


Out[93]:
71300

In [94]:
# First half of the pairs are same-author (label 1), second half
# different-author (label 0), mirroring the construction above.
n = len(knowns) // 2
labels = [1] * n + [0] * n

In [95]:
%%time
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import FeatureUnion
char_tf = TfidfVectorizer(analyzer='char', ngram_range=(2,3), min_df=0.01, lowercase=False)
word_tf = TfidfVectorizer(ngram_range=(1,2), lowercase=False, min_df=0.01)
vectorizer = FeatureUnion([
    ('char', char_tf),
    ('word', word_tf)
])

vectorizer.fit(knowns + unknowns)


CPU times: user 11min 16s, sys: 9.25 s, total: 11min 26s
Wall time: 11min 26s

In [96]:
%%time
# Vectorize both sides of every pair; the "." marks the halfway point
# of a ~12-minute transform.
known_vecs = vectorizer.transform(knowns)
print(".")
unknown_vecs = vectorizer.transform(unknowns)


.
CPU times: user 11min 54s, sys: 11.4 s, total: 12min 5s
Wall time: 12min 6s

In [97]:
len(knowns)  # sanity check: unchanged by vectorization


Out[97]:
71300

In [106]:
known_vecs.shape  # (n_texts, n_features) from the char + word feature union


Out[106]:
(71300, 17662)

In [99]:
len(labels)  # one label per pair, matching len(knowns)


Out[99]:
71300

In [100]:
from random import seed, shuffle

# Seed the shuffle so the train/test split — and every number reported
# below — is reproducible on a fresh Restart & Run All. (The original
# cell was unseeded, so each run produced a different split.)
seed(42)
indices = list(range(len(labels)))
shuffle(indices)
indices[:10]


Out[100]:
[16058, 58018, 4324, 11929, 7843, 46526, 26537, 42836, 35330, 17246]

In [101]:
len(indices)  # one shuffled position per pair


Out[101]:
71300

In [102]:
import numpy as np
# Convert to an ndarray so the fancy-index selections below
# (labels[train_indices]) work; plain lists don't support list indexing.
labels = np.array(labels)

In [103]:
# Hold out everything past the first 60k shuffled pairs (~11.3k) for
# evaluation; the split is over pairs, so known/unknown/labels stay aligned.
split = 60000
train_indices = indices[:split]
test_indices = indices[split:]

known_train = known_vecs[train_indices, :]
unknown_train = unknown_vecs[train_indices, :]
train_labels = labels[train_indices]

known_test = known_vecs[test_indices, :]
unknown_test = unknown_vecs[test_indices, :]
test_labels = labels[test_indices]

In [1]:
# NOTE(review): this cell ran as In[1] — before the split cell above — on a
# fresh kernel, hence the NameError below. Execution counts are out of
# order; re-run the notebook top-to-bottom (Restart & Run All).
len(test_indices)


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-1-4d35395c70aa> in <module>()
----> 1 len(test_indices)

NameError: name 'test_indices' is not defined

In [104]:
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import cross_val_score

In [107]:
%%time
train_pairs = np.abs(known_train - unknown_train)
test_pairs = np.abs(known_test - unknown_test)
svm = LinearSVC()
svm.fit(train_pairs, train_labels)
preds = svm.predict(test_pairs)
print(classification_report(test_labels, preds))


             precision    recall  f1-score   support

          0       0.92      0.92      0.92      5702
          1       0.92      0.92      0.92      5598

avg / total       0.92      0.92      0.92     11300

CPU times: user 29.6 s, sys: 1.88 s, total: 31.4 s
Wall time: 31.5 s

In [ ]: