In [1]:
import re
import sqlite3
# Open the SQLite database file (path is relative to the notebook's working
# directory) and create the single cursor that the cells below share.
conn = sqlite3.connect('pmcv3-full.db')
c = conn.cursor()

In [2]:
# Load every (pmid, abstract) pair. pmids[i] and abstracts[i] always describe the
# same row, so both lists must stay index-aligned.
c.execute('''SELECT pmid, abstract from abstracts''')
abstracts = []
pmids = []
# Iterating the cursor yields one row at a time until the result set is exhausted.
for pmid, abstract in c:
    pmids.append(pmid)
    if abstract is None:
        # Keep a placeholder so abstracts stays index-aligned with pmids.
        abstract = ''
    else:
        # Section headings are fused to the first word of the section
        # (e.g. "ObjectivesEmotion", "ResultsDuring"). Replace the capitalised
        # heading with a space and keep the word that follows it.
        # NOTE(review): the pattern requires a lowercase letter inside the heading,
        # so all-caps headings such as "BACKGROUNDInsulin-like" are left untouched,
        # and ordinary CamelCase words are rewritten too ("PubMed" -> " Med").
        abstract = re.sub(r'([A-Z]+[a-z]+)([A-Z][a-z]+)', r' \2', abstract).lstrip()
    abstracts.append(abstract)

In [ ]:
import nltk
# English stop-word list from NLTK; handed to TfidfVectorizer as stop_words below.
# Requires the NLTK "stopwords" corpus to be available locally
# (e.g. after nltk.download('stopwords')).
stopwords = nltk.corpus.stopwords.words('english')

def tokenize_only(text):
    """Split text into lowercase word tokens, discarding letter-free tokens.

    The text is first split into sentences and each sentence into words with
    NLTK. Every token is lowercased, and only tokens that contain at least one
    ASCII letter are kept, so bare numbers and punctuation are dropped.

    Returns a list of strings in their original order.
    """
    kept = []
    for sentence in nltk.sent_tokenize(text):
        for raw_token in nltk.word_tokenize(sentence):
            lowered = raw_token.lower()
            # Purely numeric / punctuation tokens carry no letters: skip them.
            if re.search('[a-zA-Z]', lowered) is not None:
                kept.append(lowered)
    return kept

In [ ]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(max_df=0.4, max_features=10000000,
                                   min_df=1, stop_words=stopwords,
                                   use_idf=True, tokenizer=tokenize_only, ngram_range=(1,2),
                                   sublinear_tf=True)
%time tfidf_matrix = tfidf_vectorizer.fit_transform(abstracts)
print(tfidf_matrix.shape)

In [13]:
from sklearn.metrics.pairwise import cosine_similarity
# (documents, vocabulary size). Written as a call so it prints the same text on
# Python 2 and also parses on Python 3, matching the print(...) used after fitting.
print(tfidf_matrix.shape)


(927541, 10000000)

In [15]:
# Timing probe: cosine similarity of the first document's TF-IDF row against
# every row of the matrix, i.e. the cost of one query against the whole corpus.
%time similar = cosine_similarity(tfidf_matrix[0], tfidf_matrix)


CPU times: user 49.4 s, sys: 6.93 s, total: 56.4 s
Wall time: 56.6 s

In [14]:
# pmids of the publications for which similar abstracts will be looked up.
c.execute('''SELECT pmid FROM highlycitedpmids''')
pmidlist = [row[0] for row in c.fetchall()]
# print as a call: same output on Python 2, and valid syntax on Python 3.
print(len(pmidlist)) #1076


1076

In [ ]:
%%time
similarpmids = []
for pmid in pmidlist:
    ix = pmids.index(pmid)
    similar = cosine_similarity(tfidf_matrix[ix], tfidf_matrix)
    similar = similar.argsort().flatten()[:-12:-1][1:] # top 10, excluding #1 since #1 is self
    simlist = []
    for entry in similar: simlist.append(pmids[entry])
    similarpmids.append(simlist)

In [22]:
# Rebuild the similarpubs table: one row per queried pmid, with its neighbours
# stored as a comma-separated string (the list repr without the brackets).
# Using the connection as a context manager commits when the block succeeds and
# rolls back the pending inserts (then re-raises) when it fails, so a failed
# write is no longer silently ignored.
with conn:
    # IF EXISTS lets this cell run against a database that has no table yet.
    c.execute('''DROP TABLE IF EXISTS similarpubs''')
    c.execute('''CREATE TABLE similarpubs (pmid integer, similar text, PRIMARY KEY (pmid))''')
    # pmidlist and similarpmids are positionally paired.
    c.executemany("INSERT INTO similarpubs (pmid, similar) VALUES (?, ?)",
                  ((pmid, str(neighbours).strip('[]'))
                   for pmid, neighbours in zip(pmidlist, similarpmids)))

In [ ]: