Solution: We will use TextRank for automatic summarization of medical articles. NIH's (National Institutes of Health) PubMed Central (PMC) repository provides free access to hundreds of thousands of medical articles; we will work with a selection of articles on various types of cancer. The abstract of each article will serve as the "ground truth". We will apply the TextRank algorithm to the body of each article only (excluding the abstract) to generate an extractive summary, and then use a Java-based implementation of the ROUGE software to evaluate the precision, recall, and F1 score of the extractive summary with respect to the ground truth.
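For reference, in its standard formulation TextRank treats each sentence as a vertex $V_i$ of a weighted graph and scores it with the weighted PageRank recursion of Mihalcea and Tarau (2004), where $w_{ji}$ is the similarity between sentences $j$ and $i$ and $d$ is a damping factor (typically 0.85):

$$WS(V_i) = (1 - d) + d \sum_{V_j \in In(V_i)} \frac{w_{ji}}{\sum_{V_k \in Out(V_j)} w_{jk}} \, WS(V_j)$$

The highest-scoring sentences form the extractive summary. In the steps below, sentence similarity is the cosine similarity of TF-IDF vectors, and the recursion is solved (up to a normalization constant) by networkx's pagerank in Step 6.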
Step 1: Import required modules
In [1]:
# -*- coding: utf-8 -*-
from nltk.tokenize.punkt import PunktSentenceTokenizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
import networkx as nx
import re
import urllib2
from bs4 import BeautifulSoup
import pandas as pd
Step 2: Generate a list of documents
In [2]:
# PMC IDs of the selected cancer-related articles; the corresponding article pages
# are at https://www.ncbi.nlm.nih.gov/pmc/articles/PMC<id>/
pmc_ids = ['1994795', '314300', '4383356', '4596899', '4303126',
           '4637461', '4690355', '3505152', '3976810', '4061037']
urls = ['https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pmc&id=' + pmc_id
        for pmc_id in pmc_ids]
Step 3: Preprocess the documents
In [3]:
documents = []
abstracts = []
texts = []
print 'Preprocessing documents. This may take a few minutes ...'
for i, url in enumerate(urls):
    print 'Preprocessing document %d ...' % (i+1)
    # Download the full-text XML of the article
    my_url = urllib2.urlopen(url)
    raw_doc = BeautifulSoup(my_url.read(), 'xml')
    documents.append(raw_doc)
    # Extract the cleaned abstract (the "ground truth")
    raw_abstract = raw_doc.abstract
    my_abstract = re.sub(r'<[^>]+>', r' ', str(raw_abstract))  # remove xml tags
    abstracts.append(my_abstract)
    # Extract the cleaned body text
    text = str(raw_doc.body)
    text = re.sub(r'\n', r' ', text)            # remove newline characters
    text = re.sub(r'<[^>]+>', r' ', text)       # remove xml tags
    text = re.sub(r'\[[^\[\]]+\]', r' ', text)  # remove bracketed references
    text = re.sub(r'\[', r' ', text)            # remove any remaining [
    text = re.sub(r'\]', r' ', text)            # remove any remaining ]
    text = re.sub(r'\s{2,}', r' ', text)        # collapse runs of whitespace into a single space
    text = re.sub(r'\.\s+,\s+', r'. ', text)    # remove stray commas left after a period
    text = text.decode('utf-8')                 # convert the UTF-8 byte string to unicode
    texts.append(text)
print 'All documents preprocessed successfully.'
print 'We have %d documents with %d abstracts and %d texts.' % (len(documents), len(abstracts), len(texts))
assert len(documents) == len(abstracts)
assert len(documents) == len(texts)
Step 4: Split the documents into sentences
In [4]:
punkttokenizer = PunktSentenceTokenizer()
text_sentences = []
for text in texts:
    # Split each document body into a list of sentences
    sentences = punkttokenizer.tokenize(text)
    text_sentences.append(sentences)
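As an optional check that the Punkt tokenizer produced a sensible split, the sentence count per document can be printed (this cell is an addition and assumes the cells above have already run):
In [ ]:
for i, sentences in enumerate(text_sentences):
    print 'Document %d: %d sentences' % (i+1, len(sentences))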
Step 5: Count the term frequency for sentences
In [5]:
tf_matrices = []
tfidf_matrices = []
cosine_similarity_matrices = []
print 'Calculating sentence similarities. This may take a few minutes ...'
for i, sentences in enumerate(text_sentences):
    print 'Calculating sentence similarities of document %d ...' % (i+1)
    # Term-frequency matrix: one row per sentence, one column per term
    tf_matrix = CountVectorizer().fit_transform(sentences)
    tf_matrices.append(tf_matrix)
    # TF-IDF weighting of the term counts
    tfidf_matrix = TfidfTransformer().fit_transform(tf_matrix)
    tfidf_matrices.append(tfidf_matrix)
    # Pairwise cosine similarities between sentence vectors
    cosine_similarity_matrix = tfidf_matrix * tfidf_matrix.T
    cosine_similarity_matrices.append(cosine_similarity_matrix)
print 'All documents processed successfully.'
print 'We have %d documents with %d tf_matrices, %d tfidf_matrices and %d cosine_similarity_matrices.' \
    % (len(documents), len(tf_matrices), len(tfidf_matrices), len(cosine_similarity_matrices))
assert len(documents) == len(tf_matrices)
assert len(documents) == len(tfidf_matrices)
assert len(documents) == len(cosine_similarity_matrices)
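Because TfidfTransformer L2-normalises each row by default, the product tfidf_matrix * tfidf_matrix.T is the matrix of pairwise cosine similarities between sentences. The following optional sanity check on the first document is an addition here (the numpy import is new); the diagonal self-similarities should be close to 1.0 for any sentence that contains at least one indexed term.
In [ ]:
import numpy as np

# Inspect the cosine-similarity matrix of the first document
sims = cosine_similarity_matrices[0].toarray()
print 'Similarity matrix shape:', sims.shape
print 'Sample self-similarities (diagonal):', sims.diagonal()[:5]
print 'Symmetric:', np.allclose(sims, sims.T)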
Step 6: Calculate TextRank
In [6]:
similarity_graphs = []
graph_ranks = []
highest_ranks = []
lowest_ranks = []
print 'Calculating TextRanks. This may take a few minutes ...'
for i, cosine_similarity_matrix in enumerate(cosine_similarity_matrices):
    print 'Calculating TextRanks of document %d ...' % (i+1)
    # Build an undirected graph whose nodes are sentences and whose edge
    # weights are the cosine similarities between them
    similarity_graph = nx.from_scipy_sparse_matrix(cosine_similarity_matrix)
    similarity_graphs.append(similarity_graph)
    # PageRank over the similarity graph gives the TextRank score of each sentence
    ranks = nx.pagerank(similarity_graph)
    graph_ranks.append(ranks)
    highest = sorted(((ranks[j], s) for j, s in enumerate(text_sentences[i])), reverse=True)
    highest_ranks.append(highest)
    lowest = sorted(((ranks[j], s) for j, s in enumerate(text_sentences[i])), reverse=False)
    lowest_ranks.append(lowest)
print 'All documents processed successfully.'
print 'We have %d documents with %d similarity_graphs, %d graph_ranks and %d highest_ranks.' \
    % (len(documents), len(similarity_graphs), len(graph_ranks), len(highest_ranks))
assert len(documents) == len(similarity_graphs)
assert len(documents) == len(graph_ranks)
assert len(documents) == len(highest_ranks)
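Before writing the summaries to disk, the single highest-ranked sentence of the first document can be inspected; this optional cell is an addition and assumes the cells above have run.
In [ ]:
# Peek at the top TextRank sentence of document 1
top_score, top_sentence = highest_ranks[0][0]
print 'Top TextRank score for document 1: %.4f' % top_score
print top_sentence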
Step 7: Save extractive summaries
In [7]:
print 'Saving extractive summaries. This may take a few minutes ...'
for i, highest in enumerate(highest_ranks):
    print 'Writing extractive summary for document %d ...' % (i+1)
    out_file = '\\TextRank\\system\\article%d_system1.txt' % (i+1)
    with open(out_file, 'w') as f:
        # Keep the five highest-ranked sentences as the extractive summary
        for j in range(5):
            f.write((highest[j][1] + '\n').encode('utf-8'))
print 'All documents processed successfully.'
Step 8: Save ground truths
In [8]:
print 'Saving ground truths. This may take a few minutes ...'
for i, abstract in enumerate(abstracts):
    print 'Writing ground truth for document %d ...' % (i+1)
    out_file = '\\TextRank\\reference\\article%d_reference1.txt' % (i+1)
    with open(out_file, 'w') as f:
        f.write(abstract.strip() + '\n')
print 'All documents processed successfully.'
Step 9: Calculate ROUGE score
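ROUGE-N measures n-gram overlap between each system summary (the TextRank output) and its reference (the abstract). Writing $C$ for the number of n-grams that occur in both, the reported scores are

$$\text{Recall} = \frac{C}{\#\,n\text{-grams in reference}}, \qquad \text{Precision} = \frac{C}{\#\,n\text{-grams in system summary}}, \qquad F_1 = \frac{2 \cdot \text{Precision} \cdot \text{Recall}}{\text{Precision} + \text{Recall}}$$

The Java tool writes these scores to results.csv, which is loaded and sorted below.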
In [9]:
%cd C:\ROUGE
!java -jar rouge2.0_0.2.jar
In [10]:
df = pd.read_csv('results.csv')
print df.sort_values('Avg_F-Score', ascending=False)