In [2]:
from collections import Counter
import json
In [8]:
%%time
# Load every Yelp review (one JSON object per line) into a list.
reviews = []
with open("yelp_academic_dataset_review.json") as f:
    for i, line in enumerate(f):
        # Progress marker every 1,000,000 lines — the file is large.
        if i % 1000000 == 0:
            print(i)
        reviews.append(json.loads(line))
In [9]:
print(reviews[0])  # inspect one raw record to see the review schema
In [81]:
# Group all review texts per author.
# The original built each author's string with repeated `+=` concatenation,
# which is accidentally O(total_chars^2) across millions of reviews;
# collect the pieces in a list first and join once at the end (linear).
texts_by_author = {}  # user_id -> [review text, ...]
for review in reviews:
    texts_by_author.setdefault(review['user_id'], []).append(review['text'])
# author -> "review 1\nreview 2\nreview 3" (same "\n"-joined result as before)
by_author = {uid: "\n".join(texts) for uid, texts in texts_by_author.items()}
In [82]:
len(by_author)  # number of distinct authors before length filtering
Out[82]:
In [83]:
# Keep only authors with more than 6010 characters of text in total —
# just enough to carve out a 3000-char "known" slice and a 3000-char
# "unknown" slice (chars 3010:6010) per author in the next cell.
# (The original comment said "10k characters", but the code uses 6010.)
by_author = {author: text for author, text in by_author.items() if len(text) > 6010}
In [85]:
len(by_author)
Out[85]:
In [86]:
known_texts = []
unknown_texts = []
# Split each author's text into a "known" and an "unknown" sample.
# Chars 3000:3010 are deliberately skipped so the two halves cannot be
# matched trivially via a text fragment broken across the boundary.
for text in by_author.values():
    known_texts.append(text[:3000])
    unknown_texts.append(text[3010:6010])
In [91]:
len(unknown_texts)  # one unknown sample per retained author
Out[91]:
In [92]:
# Build verification pairs: the first half keeps each author's own
# (known, unknown) pair (same-author, label 1); the second half pairs
# each known with a *different* author's unknown (label 0).
total = len(known_texts)  # was hard-coded 71300 (the value in the original run); derive it from the data
half = total // 2
known_same = known_texts[:half]
unknown_same = unknown_texts[:half]
known_diff = known_texts[half:total]
unknown_diff = unknown_texts[half:total]
# Rotate the "diff" unknowns by one position so every known text is
# paired with another author's unknown text.
unknown_diff = unknown_diff[1:] + unknown_diff[:1]
knowns = known_same + known_diff
unknowns = unknown_same + unknown_diff
In [93]:
len(knowns)  # total number of verification pairs
Out[93]:
In [94]:
# Labels mirror the pair construction above: 1 = same author for the
# first half, 0 = different author for the second half.
n = len(knowns) // 2
labels = [1] * n + [0] * n
In [95]:
%%time
# Character- and word-level TF-IDF features, concatenated into one
# feature space. lowercase=False keeps case distinctions — presumably
# so capitalisation habits remain an authorship signal (TODO confirm);
# min_df=0.01 drops n-grams appearing in fewer than 1% of the texts.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import FeatureUnion
char_tf = TfidfVectorizer(analyzer='char', ngram_range=(2,3), min_df=0.01, lowercase=False)
word_tf = TfidfVectorizer(ngram_range=(1,2), lowercase=False, min_df=0.01)
vectorizer = FeatureUnion([
('char', char_tf),
('word', word_tf)
])
# Fit vocabulary and idf statistics on all texts (known + unknown).
vectorizer.fit(knowns + unknowns)
In [96]:
%%time
# Vectorize both sides of every pair with the fitted vectorizer.
known_vecs = vectorizer.transform(knowns)
print(".")  # progress marker between the two slow transforms
unknown_vecs = vectorizer.transform(unknowns)
In [97]:
len(knowns)  # sanity check: number of pairs fed into the vectorizer
Out[97]:
In [106]:
known_vecs.shape  # rows = pairs, columns = combined char+word TF-IDF features
Out[106]:
In [99]:
len(labels)  # should equal len(knowns)
Out[99]:
In [100]:
# Shuffle the pair indices before the train/test split. Seed the RNG so
# the split (and every result downstream) is reproducible under
# Restart-&-Run-All; the original cell had no seed.
from random import seed, shuffle
seed(42)
indices = list(range(len(labels)))
shuffle(indices)
indices[:10]
Out[100]:
In [101]:
len(indices)  # should equal len(labels)
Out[101]:
In [102]:
import numpy as np
# Convert to ndarray so the integer index lists below can be used for
# fancy indexing (plain Python lists don't support labels[index_list]).
labels = np.array(labels)
In [103]:
# Train/test split over the shuffled pair indices.
N_TRAIN = 60000  # magic number made explicit; the remainder is the test set
train_indices = indices[:N_TRAIN]
test_indices = indices[N_TRAIN:]
known_train = known_vecs[train_indices, :]
unknown_train = unknown_vecs[train_indices, :]
train_labels = labels[train_indices]
known_test = known_vecs[test_indices, :]
unknown_test = unknown_vecs[test_indices, :]
test_labels = labels[test_indices]
In [1]:
len(test_indices)  # size of the held-out test set
In [104]:
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import cross_val_score
In [107]:
%%time
# Represent each (known, unknown) pair by the element-wise absolute
# difference of its TF-IDF vectors, then train a linear SVM to separate
# same-author pairs from different-author pairs.
train_pairs = np.abs(known_train - unknown_train)
test_pairs = np.abs(known_test - unknown_test)
classifier = LinearSVC()
classifier.fit(train_pairs, train_labels)
preds = classifier.predict(test_pairs)
print(classification_report(test_labels, preds))
In [ ]: