In [1]:
import os 
import csv
import nltk
import heapq
import gensim
import string
import itertools

from operator import itemgetter

Data Loading

Loads the corpus from a CSV file containing metadata and abstracts about the paper, then performs some lightweight tokenization of the abstracts in the corpus.


In [2]:
DATA = "data/ieee-xplore.csv"

def load_data(path=DATA):
    with open(path, 'r') as f:
        reader = csv.DictReader(f) 
        for row in reader:
            row['Tokenized Abstract'] = tokenize(row['Abstract'])
            yield row
        

def tokenize(text):
    return [
        list(nltk.wordpunct_tokenize(sent))
        for sent in nltk.sent_tokenize(text)
    ]

corpus = list(load_data())

In [3]:
corpus[0].keys()


Out[3]:
odict_keys(['Document Title', 'Authors', 'Author Affiliations', 'Publication Title', 'Date Added To Xplore', 'Publication_Year', 'Volume', 'Issue', 'Start Page', 'End Page', 'Abstract', 'ISSN', 'ISBNs', 'DOI', 'Funding Information', 'PDF Link', 'Author Keywords', 'IEEE Terms', 'INSPEC Controlled Terms', 'INSPEC Non-Controlled Terms', 'Mesh_Terms', 'Article Citation Count', 'Reference Count', 'Copyright Year', 'License', 'Online Date', 'Issue Date', 'Meeting Date', 'Publisher', 'Document Identifier', '', 'Tokenized Abstract'])

Key Phrase Extraction

Extract key phrases from abstracts/documents ranked by TF-IDF


In [4]:
PUNCT     = set(string.punctuation)
STOPWORDS = set(nltk.corpus.stopwords.words('english'))
GRAMMAR   = r'KT: {(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+}'
GOODTAGS  = set(['JJ','JJR','JJS','NN','NNP','NNS','NNPS'])


def candidates(abstract, chunks=True, grammar=GRAMMAR, tags=GOODTAGS):
    """
    Extracts the candidate terms (chunks or individual workds).
    """
    tagged = nltk.pos_tag_sents(abstract)

    if chunks:
        # Extract candidate chunks
        chunker = nltk.chunk.regexp.RegexpParser(grammar)
        chunks  = list(itertools.chain.from_iterable(
            nltk.chunk.tree2conlltags(chunker.parse(sent))
            for sent in tagged
        ))

        def object_filter(group):
            word, pos, chunk = group
            return chunk != 'O'

        # Rejoin candidates as text
        candidates = [
            " ".join(word for word, pos, chunk in group).lower()
            for key, group in itertools.groupby(chunks, object_filter) if key
        ]

        # Filter stopwords and puntuation
        return [
            cand for cand in candidates
            if (
                cand not in STOPWORDS and
                not all(char in PUNCT for char in cand))
        ]

    else:
        tagged = itertools.chain.from_iterable(tagged)

        # Return lower case and filtered words
        return [
            word.lower() for word, tag in tagged
            if (
                tag in tags and word.lower() not in STOPWORDS
                and not all(char in PUNCT for char in word)
            )
        ]


def score_keyphrases_by_tfidf(texts, fileids, N=20):
    """
    Computes the TF-IDF scoring of the corpus given a list of lists of
    candidate terms where each list represents a single document.
    """

    id2word = gensim.corpora.Dictionary(texts)
    corpus  = [id2word.doc2bow(text) for text in texts]
    tfidf   = gensim.models.TfidfModel(corpus)
    scored  = tfidf[corpus]

    output  = []
    for idx, doc in enumerate(scored):
        output.append(u"Document '{}' key phrases:".format(fileids[idx]))

        # Get the top N terms by TF-IDF score
        for wid, score in heapq.nlargest(N, doc, key=itemgetter(1)):
            output.append(u"{:0.3f}: {}".format(score, id2word[wid]))

        output.extend([u'',u''])

    return u"\n".join(output)

In [6]:
fileids = [doc['Document Title'] for doc in corpus]
with open('data/keyphrases.txt', 'w') as f:
    f.write(
        score_keyphrases_by_tfidf([candidates(doc['Tokenized Abstract']) for doc in corpus], fileids, 5)
    )

Clustering Documents


In [4]:
corpus[0].keys()


Out[4]:
odict_keys(['Document Title', 'Authors', 'Author Affiliations', 'Publication Title', 'Date Added To Xplore', 'Publication_Year', 'Volume', 'Issue', 'Start Page', 'End Page', 'Abstract', 'ISSN', 'ISBNs', 'DOI', 'Funding Information', 'PDF Link', 'Author Keywords', 'IEEE Terms', 'INSPEC Controlled Terms', 'INSPEC Non-Controlled Terms', 'Mesh_Terms', 'Article Citation Count', 'Reference Count', 'Copyright Year', 'License', 'Online Date', 'Issue Date', 'Meeting Date', 'Publisher', 'Document Identifier', '', 'Tokenized Abstract'])

In [5]:
len(corpus)


Out[5]:
9818

In [6]:
from sklearn.pipeline import Pipeline 
from sklearn.cluster import MiniBatchKMeans 
from sklearn.feature_extraction.text import TfidfVectorizer

In [7]:
model = Pipeline([
    ('tfidf', TfidfVectorizer()), 
    ('kmeans', MiniBatchKMeans()), 
])

model.fit([doc['Abstract'] for doc in corpus])


Out[7]:
Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
 ...sters=8,
        n_init=3, random_state=None, reassignment_ratio=0.01, tol=0.0,
        verbose=0))])

In [8]:
model.get_params()


Out[8]:
{'kmeans': MiniBatchKMeans(batch_size=100, compute_labels=True, init='k-means++',
         init_size=None, max_iter=100, max_no_improvement=10, n_clusters=8,
         n_init=3, random_state=None, reassignment_ratio=0.01, tol=0.0,
         verbose=0),
 'kmeans__batch_size': 100,
 'kmeans__compute_labels': True,
 'kmeans__init': 'k-means++',
 'kmeans__init_size': None,
 'kmeans__max_iter': 100,
 'kmeans__max_no_improvement': 10,
 'kmeans__n_clusters': 8,
 'kmeans__n_init': 3,
 'kmeans__random_state': None,
 'kmeans__reassignment_ratio': 0.01,
 'kmeans__tol': 0.0,
 'kmeans__verbose': 0,
 'memory': None,
 'steps': [('tfidf',
   TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
           dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
           lowercase=True, max_df=1.0, max_features=None, min_df=1,
           ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
           stop_words=None, strip_accents=None, sublinear_tf=False,
           token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
           vocabulary=None)),
  ('kmeans',
   MiniBatchKMeans(batch_size=100, compute_labels=True, init='k-means++',
           init_size=None, max_iter=100, max_no_improvement=10, n_clusters=8,
           n_init=3, random_state=None, reassignment_ratio=0.01, tol=0.0,
           verbose=0))],
 'tfidf': TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
         dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
         lowercase=True, max_df=1.0, max_features=None, min_df=1,
         ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
         stop_words=None, strip_accents=None, sublinear_tf=False,
         token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
         vocabulary=None),
 'tfidf__analyzer': 'word',
 'tfidf__binary': False,
 'tfidf__decode_error': 'strict',
 'tfidf__dtype': numpy.int64,
 'tfidf__encoding': 'utf-8',
 'tfidf__input': 'content',
 'tfidf__lowercase': True,
 'tfidf__max_df': 1.0,
 'tfidf__max_features': None,
 'tfidf__min_df': 1,
 'tfidf__ngram_range': (1, 1),
 'tfidf__norm': 'l2',
 'tfidf__preprocessor': None,
 'tfidf__smooth_idf': True,
 'tfidf__stop_words': None,
 'tfidf__strip_accents': None,
 'tfidf__sublinear_tf': False,
 'tfidf__token_pattern': '(?u)\\b\\w\\w+\\b',
 'tfidf__tokenizer': None,
 'tfidf__use_idf': True,
 'tfidf__vocabulary': None}

In [9]:
# model.steps[-1]
centers = model.named_steps['kmeans'].cluster_centers_

In [10]:
from heapq import nlargest 
from operator import itemgetter 

def rank_center_terms(k=0, n=10):
    words = model.named_steps['tfidf'].inverse_transform([centers[k]])[0]
    return nlargest(n, list(zip(words, centers[k])), key=itemgetter(1))


rank_center_terms()


Out[10]:
[('accuracy', 0.0),
 ('accurate', 0.0),
 ('active', 0.0),
 ('algorithm', 0.0),
 ('all', 0.0),
 ('an', 0.0),
 ('and', 0.0),
 ('are', 0.0),
 ('as', 0.0),
 ('at', 0.0)]

In [11]:
rank_center_terms(1)


Out[11]:
[('exaction', 0.062358596251125074),
 ('probing', 0.03465919098497553),
 ('recycling', 0.02756994324354016),
 ('essential', 0.025343080418012906),
 ('tag', 0.02505765241010308),
 ('protection', 0.024887571634227258),
 ('studies', 0.02275766343521735),
 ('μw', 0.02042169191535145),
 ('highbackground', 0.01869649295869566),
 ('founded', 0.017089100792631574)]

In [12]:
model.named_steps['kmeans'].labels_


Out[12]:
array([7, 7, 7, ..., 3, 5, 1], dtype=int32)

In [13]:
def titles_for_cluster(k=0):
    for idx, label in enumerate(model.named_steps['kmeans'].labels_):
        if label == k:
            print(corpus[idx]['Document Title'])
            
#titles_for_cluster()

In [14]:
%matplotlib notebook 

import seaborn as sns 
import matplotlib.pyplot as plt 

sns.distplot(model.named_steps['kmeans'].labels_)


Out[14]:
<matplotlib.axes._subplots.AxesSubplot at 0x119eb52b0>

In [15]:
import numpy as np
from sklearn.cluster import SpectralClustering
from yellowbrick.cluster import SilhouetteVisualizer 

X = TfidfVectorizer().fit_transform([doc['Abstract'] for doc in corpus])

In [16]:
oz = SilhouetteVisualizer(MiniBatchKMeans())
oz.fit(X)
oz.poof()



In [19]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin 

class ColumnSelector(BaseEstimator, TransformerMixin):
    
    def __init__(self, column):
        self.column = column 
    
    def fit(self, X):
        return self 
        
    def transform(self, X):
        return [
            doc[self.column] for doc in X 
        ]


class Tokenizer(BaseEstimator, TransformerMixin):
    
    def __init__(self, sep=";"):
        self.sep = sep 
    
    def fit(self, X):
        return self 
    
    def transform(self, X):
        return [
            {key: 1 for key in doc.split(self.sep)}
            for doc in X 
        ]

In [62]:
from sklearn.decomposition import TruncatedSVD 

model = Pipeline([
    ('cols', FeatureUnion([
        ('abstract', Pipeline([
            ('select', ColumnSelector('Abstract')),
            ('tfidf', TfidfVectorizer()), 
        ])),
        ('key terms', Pipeline([
            ('select', ColumnSelector('Author Keywords')),
            ('tokenize', Tokenizer()), 
            ('vect', DictVectorizer()), 
        ]))
    ])),
    ('svd', TruncatedSVD(1000))
])

docs = model.fit_transform(corpus)

In [64]:
docs[0].shape


Out[64]:
(1000,)

LDA


In [17]:
import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()

In [18]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [21]:
tfvec = CountVectorizer(strip_accents = 'unicode', stop_words = 'english', lowercase = True, token_pattern = r'\b[a-zA-Z]{3,}\b', max_df = 0.5, min_df = 10)
dtm_tf = tfvec.fit_transform(ColumnSelector('Abstract').fit_transform(corpus))

In [25]:
model = Pipeline([
    ('select', ColumnSelector('Abstract')),
    ('tfidf', TfidfVectorizer(**tfvec.get_params())),
    ('lda', LatentDirichletAllocation())
])

model.fit(corpus)


/Users/benjamin/.pyenv/versions/3.6.2/Python.framework/Versions/3.6/lib/python3.6/site-packages/sklearn/decomposition/online_lda.py:536: DeprecationWarning: The default value for 'learning_method' will be changed from 'online' to 'batch' in the release 0.20. This warning was introduced in 0.18.
  DeprecationWarning)
Out[25]:
Pipeline(memory=None,
     steps=[('select', ColumnSelector(column='Abstract')), ('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.5, max_features=None, min_df=10,
        ngram_range=(1, 1), ...ol=0.1, random_state=None,
             topic_word_prior=None, total_samples=1000000.0, verbose=0))])

In [26]:
pyLDAvis.sklearn.prepare(model.named_steps['lda'], dtm_tf, tfvec)


Out[26]: