In [24]:
    
import pymongo
from pymongo import MongoClient
import numpy as np
from scipy import sparse
import matplotlib.pyplot as plt
import getpass
import base64
import xgboost as xgb
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re
from wordcloud import WordCloud
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize, LabelEncoder
from sklearn.feature_selection import SelectPercentile, chi2
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, log_loss, f1_score
from sklearn.base import BaseEstimator
from sklearn.svm import SVC
%matplotlib inline
    
In [2]:
    
client = MongoClient('localhost:27017')
db = client.arXivDB
db.users.count_documents({})   # count() is deprecated in newer pymongo
    
    Out[2]:
In [3]:
    
def cleaner(doc, stem=False):
    '''Function to clean the text data and prep for further analysis'''
    doc = doc.lower() # turn text to lowercase
    stops = set(stopwords.words("english"))       # Creating a set of Stopwords
    p_stemmer = PorterStemmer()                   # Creating the stemmer model
    doc = re.sub(r"quantum", '', doc)           # removing the word quantum (duh)
    doc = re.sub(r"physics", '', doc)           # removing the word physics (duh)
    doc = re.sub(r"state", '', doc)           # removing the word state (duh)
    doc = re.sub(r'\$.*?\$', 'latexinlineformula', doc) # replacing latex inline formula
    doc = re.sub(r'\\n', ' ', doc) # removing new line character
    doc = re.sub(r'\\\\\"', '', doc)             # removing LaTeX umlaut escapes (e.g. \"o in German names)
    doc = re.sub(r"</?\w+[^>]*>", '', doc)      # removing html tags
    doc = re.sub("[^a-zA-Z]", ' ', doc)         # keeping only alphabetic characters
    doc = doc.split()                           # splitting the text into individual words
    doc = [w for w in doc if w not in stops and len(w) > 3]   # removing stopwords and very short words
    if stem:
        doc = [p_stemmer.stem(i) for i in doc]     # stemming (reducing words to their root)
    if not len(doc):                            # handling documents left empty (all stopwords, formulas or non-English text)
        doc = ['emptystring']
    # print('text cleaning done!')
    return ' '.join(doc)
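    
A quick sanity check of cleaner() on a made-up abstract fragment (both the sample sentence and the output quoted in the comment are illustrative, not taken from the database):

In [ ]:
    
# illustrative only: a made-up fragment run through the cleaner
cleaner('We study the quantum state of entangled photon pairs, see $H=\\sigma_z$ for details.', stem=True)
# returns roughly: 'studi entangl photon pair latexinlineformula detail'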
    
In [4]:
    
class feature_stacker(BaseEstimator):
    """Stacks several transformer objects to yield concatenated features.
    Similar to pipeline, a list of tuples ``(name, estimator)`` is passed
    to the constructor.
    """
    def __init__(self, transformer_list):
        self.transformer_list = transformer_list
    def get_feature_names(self):
        feature_names = []
        for name, trans in self.transformer_list:
            feature_names.extend(trans.get_feature_names())
        feature_names = [" ".join(w) if isinstance(w, tuple) else w
                            for w in feature_names]
        return np.array(feature_names)
    def fit(self, X, y=None):
        for name, trans in self.transformer_list:
            trans.fit(X, y)
        return self
    def transform(self, X):
        features = []
        for name, trans in self.transformer_list:
            features.append(trans.transform(X))
        issparse = [sparse.issparse(f) for f in features]
        if np.any(issparse):
            features = sparse.hstack(features).tocsr()
        else:
            features = np.hstack(features)
        return features
    def get_params(self, deep=True):
        if not deep:
            return super(feature_stacker, self).get_params(deep=False)
        else:
            out = dict(self.transformer_list)
            for name, trans in self.transformer_list:
                for key, value in trans.get_params(deep=True).items():
                    out['%s__%s' % (name, key)] = value
            return out
    
In [5]:
    
# word-level tf-idf over unigrams to trigrams; min_df=3 drops n-grams seen in fewer than 3 documents
vectorizer_word = TfidfVectorizer(lowercase=False,
                                 analyzer=u'word',
                                 ngram_range=(1, 3),
                                 stop_words='english',
                                 binary=False,
                                 norm=u'l2', 
                                 use_idf=True, 
                                 smooth_idf=True, 
                                 sublinear_tf=True,
                                 min_df=3)
    
In [6]:
    
# character-level tf-idf (1- to 5-grams); note that stop_words is ignored when analyzer='char'
vectorizer_char = TfidfVectorizer(lowercase=False,
                                 analyzer=u'char',
                                 ngram_range=(1, 5),
                                 stop_words='english',
                                 binary=False,
                                 norm=u'l2', 
                                 use_idf=True, 
                                 smooth_idf=True, 
                                 sublinear_tf=True)
    
In [7]:
    
ft = feature_stacker([
                      ("words", vectorizer_word)])
    
In [8]:
    
select = SelectPercentile(score_func=chi2, percentile=1)  # keep only the top 1% of features by chi2 score
    
In [9]:
    
while True:
    username = input('username: ').lower()
    user = list(db.users.find({'username': username}))
    if not user:
        print("that username doesn't exist, try again")
        user = None
    else:
        pin = base64.b64encode(getpass.getpass('pin: ').encode('utf-8'))
        if user[0]['pin'] != pin:
            print('pin is incorrect, try again')
            user = None
        else:
            break
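    
The pin check above assumes the stored 'pin' field holds the base64-encoded UTF-8 bytes of the PIN. A hypothetical registration snippet consistent with that assumption (the username and PIN values are made up):

In [ ]:
    
# hypothetical: how a user document would need to be stored for the
# login loop above to succeed
db.users.insert_one({'username': 'alice',
                     'pin': base64.b64encode('1234'.encode('utf-8'))})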
    
    
In [10]:
    
query_results = list(db.likes.find({'user_id':user[0]['_id']}, {'paper_id':1, '_id':0, 'like':1}))
mypaper_ids = [d['paper_id'] for d in query_results]
mylikes = [d['like'] for d in query_results]
    
In [11]:
    
# find() with $in does not preserve the order of mypaper_ids, so fetch the papers
# keyed by _id and rebuild the list in the same order as mylikes to keep labels aligned
papers = {d['_id']: d for d in db.arXivfeeds.find({'_id': {'$in': mypaper_ids}},
                                                  {'title': 1, 'summary': 1})}
documents = [cleaner(' '.join([papers[i]['title'], papers[i]['summary']]), stem=True)
             for i in mypaper_ids]
    
In [12]:
    
text = ' '.join([d for (d,l) in zip(documents,mylikes) if l])
# Generate a word cloud image
wordcloud = WordCloud().generate(text)
# Display the generated image:
plt.imshow(wordcloud)
plt.axis("off")
plt.savefig('wordCloud.png')
    
    
In [13]:
    
le = LabelEncoder()
le.fit(mylikes)
Y = le.transform(mylikes)
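    
If the like/dislike classes are imbalanced (the SVC further down uses class_weight='balanced'), a quick look at the label distribution can help:

In [ ]:
    
# counts per original like/dislike label, ordered as in le.classes_
print(dict(zip(le.classes_, np.bincount(Y))))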
    
In [14]:
    
ft.fit(documents)
X = ft.transform(documents)
select.fit(X, Y)
X = select.transform(X)
X_train, X_test, y_train, y_test = train_test_split(X, Y, train_size=.80)
    
In [15]:
    
X.shape
    
    Out[15]:
In [16]:
    
names = ft.get_feature_names()
names[select.get_support()]
    
    Out[16]:
In [100]:
    
clf = SVC(C=1, 
          kernel='rbf', 
          degree=2,               # ignored for the rbf kernel
          gamma='auto', 
          coef0=0.0, 
          shrinking=True, 
          probability=True, 
          tol=1e-5, 
          cache_size=200, 
          class_weight='balanced', 
          verbose=True, 
          max_iter=1000,          # SVC expects an int here, not the float 1e3
          decision_function_shape='ovr', 
          random_state=None)
clf.fit(X_train, y_train)
pred = clf.predict_proba(X_test)
    
    
In [17]:
    
##################
#     XGBoost
##################
dtrain = xgb.DMatrix(X_train, y_train)
dvalid = xgb.DMatrix(X_test, y_test)
params = {
    "objective": "binary:logistic",
    "booster": "gbtree",
    "max_depth": 7,
    "eval_metric": "logloss",
    "eta": 0.1,
    "silent": 1,
    "alpha": 3,
}
watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
clf = xgb.train(params, dtrain, 30, evals=watchlist, verbose_eval=True)
pred = clf.predict(xgb.DMatrix(X_test))
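    
log_loss and f1_score are imported at the top but never used; a possible check on the XGBoost validation predictions (the 0.5 cutoff is an arbitrary choice for this sketch):

In [ ]:
    
# pred from the cell above is a 1-D array of predicted like-probabilities on the test split
print('log loss: %.4f' % log_loss(y_test, pred))
print('f1 score: %.4f' % f1_score(y_test, (pred > 0.5).astype(int)))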
    
    
In [101]:
    
a = plt.hist(pred[:,1], bins=50)
    
    
In [103]:
    
print('confusion matrix:')
print(confusion_matrix(y_test, (pred<0.0842)[:,1]))
    
    
In [19]:
    
num_papers_toshow = 20
test_documents = ['___'.join([d['title'], d['summary']]) for d in db.arXivfeeds.aggregate([
            {"$sample": {'size': num_papers_toshow}}
        ])]
    
In [20]:
    
test_X = ft.transform([cleaner(d, stem=True) for d in test_documents])  # stem=True to match the training preprocessing
test_X = select.transform(test_X)
pred = clf.predict(xgb.DMatrix(test_X))
    
In [21]:
    
test_documents = [x for (y,x) in sorted(zip(pred, test_documents))]
pred = np.sort(pred)
    
In [22]:
    
for j in range(len(pred)):
    print(pred[j])
    print(test_documents[j].split('___')[0])
    
    
In [ ]: