In [1]:
import re
import sqlite3
# Open the SQLite database file and create the cursor that the queries in
# the following cells execute against.
conn = sqlite3.connect('pmcv3-full.db')
c = conn.cursor()
In [2]:
# Load every (pmid, abstract) row into two parallel lists: pmids[i] is the
# identifier of the text stored in abstracts[i].
c.execute('''SELECT pmid, abstract from abstracts''')
abstracts = []
pmids = []
# A sqlite3 cursor is iterable, so no manual fetchone()/None-check loop is
# needed; each row is a (pmid, abstract) tuple.
for pmid, abstract in c:
    pmids.append(pmid)
    if abstract is None:
        # Keep an empty placeholder so both lists stay index-aligned.
        abstract = ''
    else:
        # Clean section headings (e.g. Objectives, Results) that arrive fused
        # to the following word, as in "ObjectivesEmotion" or "ResultsDuring":
        # the leading capitalised run is replaced by a space, and lstrip()
        # removes that space again when the heading opened the abstract.
        # NOTE(review): the pattern needs at least one lowercase letter in the
        # heading, so an all-caps heading such as "BACKGROUNDInsulin-like" is
        # left untouched.  It is also applied to every match in the text, so
        # other CamelCase tokens are rewritten too (e.g. "PubMed" -> " Med").
        abstract = re.sub(r'([A-Z]+[a-z]+)([A-Z][a-z]+)', r' \2', abstract).lstrip()
    abstracts.append(abstract)
In [ ]:
import nltk
# English stop-word list from NLTK's stopwords corpus; it is handed to the
# TfidfVectorizer below through its stop_words argument.
stopwords = nltk.corpus.stopwords.words('english')
def tokenize_only(text):
    """Return the lower-cased word tokens of *text* that contain a letter.

    The text is split into sentences first and each sentence into words.
    Tokens without any ASCII letter in them (for example bare numbers or
    punctuation) are discarded.
    """
    kept = []
    for sentence in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sentence):
            lowered = word.lower()
            if re.search('[a-zA-Z]', lowered) is not None:
                kept.append(lowered)
    return kept
In [ ]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(max_df=0.4, max_features=10000000,
min_df=1, stop_words=stopwords,
use_idf=True, tokenizer=tokenize_only, ngram_range=(1,2),
sublinear_tf=True)
%time tfidf_matrix = tfidf_vectorizer.fit_transform(abstracts)
print(tfidf_matrix.shape)
In [13]:
from sklearn.metrics.pairwise import cosine_similarity
# Parenthesised form: valid on both Python 2 and Python 3, and consistent
# with the earlier print(tfidf_matrix.shape) call.
print(tfidf_matrix.shape)
In [15]:
# Time one similarity query: the first row of tfidf_matrix against every row
# of the matrix.  The resulting `similar` is reassigned inside the loop over
# pmidlist further down.
%time similar = cosine_similarity(tfidf_matrix[0], tfidf_matrix)
In [14]:
# Fetch the pmids listed in the highlycitedpmids table; these are the
# publications for which similar abstracts are looked up below.
c.execute('''SELECT pmid FROM highlycitedpmids''')
pmidlist = [entry[0] for entry in c.fetchall()]
# Parenthesised form so the line is valid on both Python 2 and Python 3.
print(len(pmidlist))  # 1076
In [ ]:
%%time
# For each pmid in pmidlist, collect the pmids of the 10 abstracts whose
# TF-IDF vectors have the highest cosine similarity to it.  similarpmids[i]
# holds the neighbours of pmidlist[i], best match first.
similarpmids = []
for pmid in pmidlist:
    # Row of this publication in tfidf_matrix; pmids is index-aligned with
    # the abstracts the matrix was fitted on.
    ix = pmids.index(pmid)
    # Indices of the 11 highest scores, best first: one more than needed so
    # that the publication itself can be discarded.
    ranked = cosine_similarity(tfidf_matrix[ix], tfidf_matrix).argsort().flatten()[:-12:-1]
    # Exclude the query row by index instead of assuming it is ranked first.
    # With an all-zero vector (empty abstract) or tied scores the query is
    # not necessarily rank 1, and dropping rank 1 blindly would discard a
    # genuine neighbour while possibly keeping the publication itself.
    similar = ranked[ranked != ix][:10]
    simlist = [pmids[entry] for entry in similar]
    similarpmids.append(simlist)
In [22]:
# Rebuild the similarpubs table: one row per pmid in pmidlist, with its
# similar pmids stored as a comma-separated string.
# IF EXISTS lets the cell run on a database that has no similarpubs table
# yet; a plain DROP TABLE raises an error in that case.
c.execute('''DROP TABLE IF EXISTS similarpubs''')
c.execute('''CREATE TABLE similarpubs (pmid integer, similar text, PRIMARY KEY (pmid))''')
# str(list).strip('[]') turns e.g. [1, 2, 3] into "1, 2, 3".
similarpub_rows = [
    (source_pmid, str(neighbours).strip('[]'))
    for source_pmid, neighbours in zip(pmidlist, similarpmids)
]
c.executemany("INSERT INTO similarpubs (pmid, similar) VALUES (?, ?)", similarpub_rows)
# Commit through the connection so a failed commit is reported rather than
# swallowed by a bare except around a raw COMMIT statement.
conn.commit()
In [ ]: