In [24]:
import pymongo
from pymongo import MongoClient
import numpy as np
from scipy import sparse
import matplotlib.pyplot as plt
import getpass
import base64
import xgboost as xgb
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re
from wordcloud import WordCloud
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize, LabelEncoder
from sklearn.feature_selection import SelectPercentile, chi2
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, log_loss, f1_score
from sklearn.base import BaseEstimator
from sklearn.svm import SVC
%matplotlib inline
In [2]:
client = MongoClient('localhost:27017')
db = client.arXivDB
db.users.count()
Out[2]:
In [3]:
def cleaner(doc, stem=False):
    '''Function to clean the text data and prep for further analysis'''
    doc = doc.lower()  # turn text to lowercase
    stops = set(stopwords.words("english"))  # creating a set of stopwords
    p_stemmer = PorterStemmer()  # creating the stemmer model
    doc = re.sub(r"quantum", '', doc)  # removing the word quantum (duh)
    doc = re.sub(r"physics", '', doc)  # removing the word physics (duh)
    doc = re.sub(r"state", '', doc)  # removing the word state (duh)
    doc = re.sub(r'\$.*?\$', 'latexinlineformula', doc)  # replacing LaTeX inline formulas
    doc = re.sub(r'\\n', ' ', doc)  # removing escaped newline characters
    doc = re.sub(r'\\\\\"', '', doc)  # removing LaTeX umlaut escapes (German double-dotted letters)
    doc = re.sub(r"</?\w+[^>]*>", '', doc)  # removing html tags
    doc = re.sub("[^a-zA-Z]", ' ', doc)  # removing anything other than alphabetic characters
    doc = doc.split()  # splits the text into individual words
    doc = [w for w in doc if w not in stops and len(w) > 3]  # removes stopwords and very short words
    if stem:
        doc = [p_stemmer.stem(i) for i in doc]  # stemming (reducing words to their root)
    if not len(doc):  # dealing with documents that are empty after cleaning (all stop words, other languages, etc.)
        doc = ['emptystring']
    # print('text cleaning done!')
    return ' '.join(doc)
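A quick sanity check on a made-up abstract (the string below is only an illustration, not a real arXiv record):
In [ ]:
sample = "The Quantum State of a $\\psi$-entangled photon pair is measured."
print(cleaner(sample))             # LaTeX, stop words and the filler terms stripped
print(cleaner(sample, stem=True))  # same text with Porter stemming applied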
In [4]:
class feature_stacker(BaseEstimator):
    """Stacks several transformer objects to yield concatenated features.
    Similar to pipeline, a list of tuples ``(name, estimator)`` is passed
    to the constructor.
    """
    def __init__(self, transformer_list):
        self.transformer_list = transformer_list

    def get_feature_names(self):
        feature_names = []
        for name, trans in self.transformer_list:
            feature_names.extend(trans.get_feature_names())
        feature_names = [" ".join(w) if isinstance(w, tuple) else w
                         for w in feature_names]
        return np.array(feature_names)

    def fit(self, X, y=None):
        for name, trans in self.transformer_list:
            trans.fit(X, y)
        return self

    def transform(self, X):
        features = []
        for name, trans in self.transformer_list:
            features.append(trans.transform(X))
        issparse = [sparse.issparse(f) for f in features]
        if np.any(issparse):
            features = sparse.hstack(features).tocsr()
        else:
            features = np.hstack(features)
        return features

    def get_params(self, deep=True):
        if not deep:
            return super(feature_stacker, self).get_params(deep=False)
        else:
            out = dict(self.transformer_list)
            for name, trans in self.transformer_list:
                for key, value in trans.get_params(deep=True).items():
                    out['%s__%s' % (name, key)] = value
            return out
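A minimal check of the stacker on a toy corpus (the two small vectorizers below are only an illustration; the real ones are configured in the next cells):
In [ ]:
toy_docs = ["entangled photon pairs", "topological insulators"]
toy_ft = feature_stacker([("w", TfidfVectorizer(analyzer='word')),
                          ("c", TfidfVectorizer(analyzer='char', ngram_range=(2, 3)))])
toy_ft.fit(toy_docs)
print(toy_ft.transform(toy_docs).shape)  # (2, n_word_features + n_char_features)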
In [5]:
vectorizer_word = TfidfVectorizer(lowercase=False,
                                  analyzer=u'word',
                                  ngram_range=(1, 3),
                                  stop_words='english',
                                  binary=False,
                                  norm=u'l2',
                                  use_idf=True,
                                  smooth_idf=True,
                                  sublinear_tf=True,
                                  min_df=3)
In [6]:
vectorizer_char = TfidfVectorizer(lowercase=False,
                                  analyzer=u'char',
                                  ngram_range=(1, 5),
                                  stop_words='english',
                                  binary=False,
                                  norm=u'l2',
                                  use_idf=True,
                                  smooth_idf=True,
                                  sublinear_tf=True)
In [7]:
ft = feature_stacker([
    ("words", vectorizer_word)])
In [8]:
select = SelectPercentile(score_func=chi2, percentile=1)
In [9]:
while True:
    username = input('username: ').lower()
    user = list(db.users.find({'username': username}))
    if not user:
        print("the username doesn't exist, try again")
        user = None
    else:
        pin = base64.b64encode(bytes(str(getpass.getpass('pin: ')), encoding="UTF-8"))
        if user[0]['pin'] != pin:
            print('pin is incorrect, try again')
            user = None
        else:
            break
In [10]:
query_results = list(db.likes.find({'user_id':user[0]['_id']}, {'paper_id':1, '_id':0, 'like':1}))
mypaper_ids = [d['paper_id'] for d in query_results]
mylikes = [d['like'] for d in query_results]
In [11]:
documents = [cleaner(' '.join([d['title'], d['summary']]), stem=True) for d in db.arXivfeeds.find(
    {'_id': {'$in': mypaper_ids}}, {'_id': 0, 'title': 1, 'summary': 1}
)]
In [12]:
text = ' '.join([d for (d,l) in zip(documents,mylikes) if l])
# Generate a word cloud image
wordcloud = WordCloud().generate(text)
# Display the generated image:
plt.imshow(wordcloud)
plt.axis("off")
plt.savefig('wordCloud.png')
In [13]:
le = LabelEncoder()
le.fit(mylikes)
Y = le.transform(mylikes)
In [14]:
ft.fit(documents)
X = ft.transform(documents)
select.fit(X, Y)
X = select.transform(X)
X_train, X_test, y_train, y_test = train_test_split(X, Y, train_size=.80)
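Likes are usually much rarer than skips (the class_weight='balanced' setting in the SVC cell below hints at the same imbalance), so a stratified split keeps the like ratio identical in both folds. A sketch with illustrative Xs_*/ys_* names, not what was run above:
In [ ]:
# Stratified variant of the split (sketch only; not used below).
Xs_train, Xs_test, ys_train, ys_test = train_test_split(
    X, Y, train_size=0.80, stratify=Y, random_state=0)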
In [15]:
X.shape
Out[15]:
In [16]:
names = ft.get_feature_names()
names[select.get_support()]
Out[16]:
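Beyond the surviving feature names, the chi2 scores themselves can be ranked to see which terms drive the selection — a short sketch:
In [ ]:
# Top 20 features by chi2 score (select.scores_ was set by select.fit above);
# nan_to_num guards against features with undefined scores.
order = np.argsort(np.nan_to_num(select.scores_))[::-1][:20]
for idx in order:
    print('%-40s %.2f' % (names[idx], select.scores_[idx]))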
In [100]:
clf = SVC(C=1,
          kernel='rbf',
          degree=2,
          gamma='auto',
          coef0=0.0,
          shrinking=True,
          probability=True,
          tol=1e-5,
          cache_size=200,
          class_weight='balanced',
          verbose=True,
          max_iter=1000,
          decision_function_shape='ovr',
          random_state=None)
clf.fit(X_train, y_train)
pred = clf.predict_proba(X_test)
In [17]:
##################
# XGBoost
##################
dtrain = xgb.DMatrix(X_train, y_train)
dvalid = xgb.DMatrix(X_test, y_test)
params = {
"objective": "binary:logistic",
"booster": "gbtree",
"max_depth": 7,
"eval_metric": "logloss",
"eta": 0.1,
"silent": 1,
"alpha": 3,
}
watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
clf = xgb.train(params, dtrain, 30, evals=watchlist, verbose_eval=True)
pred = clf.predict(xgb.DMatrix(X_test))
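log_loss and f1_score are imported at the top but never called; a quick check of the XGBoost predictions on the held-out fold could look like this (the 0.5 cutoff is just a placeholder, not a tuned threshold):
In [ ]:
print('validation log loss: %.4f' % log_loss(y_test, pred))
print('validation F1 (0.5 cutoff): %.4f' % f1_score(y_test, pred > 0.5))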
In [101]:
a = plt.hist(pred[:,1], bins=50)
In [103]:
print('confusion matrix:')
print(confusion_matrix(y_test, (pred<0.0842)[:,1]))
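The 0.0842 cutoff above is hand-picked; one way to choose it is to sweep thresholds on the held-out fold and keep the one with the best F1. A sketch, assuming pred holds the two-column predict_proba output from the SVC cell (the comparison direction may need flipping to match the label encoding, as in the cell above):
In [ ]:
# Sketch: pick the probability cutoff that maximises F1 on the test fold.
thresholds = np.linspace(0.01, 0.99, 99)
scores = [f1_score(y_test, pred[:, 1] > t) for t in thresholds]
best_t = thresholds[int(np.argmax(scores))]
print('best threshold: %.4f (F1 = %.4f)' % (best_t, max(scores)))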
In [19]:
num_papers_toshow = 20
test_documents = ['___'.join([d['title'], d['summary']]) for d in db.arXivfeeds.aggregate([
    {"$sample": {'size': num_papers_toshow}}
])]
In [20]:
test_X = ft.transform([cleaner(d, stem=True) for d in test_documents])  # stem=True to match the training-time cleaning
test_X = select.transform(test_X)
pred = clf.predict(xgb.DMatrix(test_X))
In [21]:
test_documents = [x for (y,x) in sorted(zip(pred, test_documents))]
pred = np.sort(pred)
In [22]:
for j in range(len(pred)):
    print(pred[j])
    print(test_documents[j].split('___')[0])
In [ ]: