In [1]:
import os
import csv
import nltk
import heapq
import gensim
import string
import itertools
from operator import itemgetter
In [2]:
DATA = "data/ieee-xplore.csv"
def load_data(path=DATA):
    # Stream rows from the IEEE Xplore export, attaching tokenized abstracts.
    with open(path, 'r') as f:
        reader = csv.DictReader(f)
        for row in reader:
            row['Tokenized Abstract'] = tokenize(row['Abstract'])
            yield row

def tokenize(text):
    # Segment into sentences, then word-tokenize each sentence.
    return [
        list(nltk.wordpunct_tokenize(sent))
        for sent in nltk.sent_tokenize(text)
    ]
corpus = list(load_data())
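For reference, tokenize returns one list per sentence, each containing that sentence's word tokens; a quick sanity check against the loaded corpus (output depends on the data file):

# First ten word tokens of the first sentence of the first abstract.
corpus[0]['Tokenized Abstract'][0][:10]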
In [3]:
corpus[0].keys()
Out[3]:
In [4]:
PUNCT = set(string.punctuation)
STOPWORDS = set(nltk.corpus.stopwords.words('english'))

# Chunk grammar for key terms: optional adjectives and nouns, optionally joined
# by a preposition (e.g. "degrees of freedom"), ending in a noun phrase.
GRAMMAR = r'KT: {(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+}'
GOODTAGS = set(['JJ', 'JJR', 'JJS', 'NN', 'NNP', 'NNS', 'NNPS'])
def candidates(abstract, chunks=True, grammar=GRAMMAR, tags=GOODTAGS):
    """
    Extracts the candidate terms (chunks or individual words).
    """
    tagged = nltk.pos_tag_sents(abstract)

    if chunks:
        # Extract candidate chunks
        chunker = nltk.chunk.regexp.RegexpParser(grammar)
        chunks = list(itertools.chain.from_iterable(
            nltk.chunk.tree2conlltags(chunker.parse(sent))
            for sent in tagged
        ))

        def object_filter(group):
            word, pos, chunk = group
            return chunk != 'O'

        # Rejoin candidates as text
        candidates = [
            " ".join(word for word, pos, chunk in group).lower()
            for key, group in itertools.groupby(chunks, object_filter) if key
        ]

        # Filter stopwords and punctuation
        return [
            cand for cand in candidates
            if (
                cand not in STOPWORDS and
                not all(char in PUNCT for char in cand)
            )
        ]

    else:
        tagged = itertools.chain.from_iterable(tagged)

        # Return lowercased and filtered words
        return [
            word.lower() for word, tag in tagged
            if (
                tag in tags and word.lower() not in STOPWORDS
                and not all(char in PUNCT for char in word)
            )
        ]
def score_keyphrases_by_tfidf(texts, fileids, N=20):
    """
    Computes the TF-IDF scoring of the corpus given a list of lists of
    candidate terms where each list represents a single document.
    """
    id2word = gensim.corpora.Dictionary(texts)
    corpus = [id2word.doc2bow(text) for text in texts]
    tfidf = gensim.models.TfidfModel(corpus)
    scored = tfidf[corpus]

    output = []
    for idx, doc in enumerate(scored):
        output.append(u"Document '{}' key phrases:".format(fileids[idx]))
        # Get the top N terms by TF-IDF score
        for wid, score in heapq.nlargest(N, doc, key=itemgetter(1)):
            output.append(u"{:0.3f}: {}".format(score, id2word[wid]))
        output.extend([u'', u''])

    return u"\n".join(output)
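Before scoring the whole corpus, it can help to spot-check the extractor on a single abstract; a minimal sketch, assuming the corpus loaded above (output depends on the data):

# First ten candidate chunks from the first tokenized abstract.
candidates(corpus[0]['Tokenized Abstract'])[:10]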
In [6]:
fileids = [doc['Document Title'] for doc in corpus]

with open('data/keyphrases.txt', 'w') as f:
    f.write(score_keyphrases_by_tfidf(
        [candidates(doc['Tokenized Abstract']) for doc in corpus],
        fileids, 5
    ))
In [4]:
corpus[0].keys()
Out[4]:
In [5]:
len(corpus)
Out[5]:
In [6]:
from sklearn.pipeline import Pipeline
from sklearn.cluster import MiniBatchKMeans
from sklearn.feature_extraction.text import TfidfVectorizer
In [7]:
model = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('kmeans', MiniBatchKMeans()),
])
model.fit([doc['Abstract'] for doc in corpus])
Out[7]:
In [8]:
model.get_params()
Out[8]:
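MiniBatchKMeans defaults to n_clusters=8. The double-underscore parameter names listed by get_params() can be used to change that and refit; a sketch with an arbitrary choice of 12 clusters:

# Sketch: change the cluster count through the pipeline and refit.
model.set_params(kmeans__n_clusters=12)
model.fit([doc['Abstract'] for doc in corpus])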
In [9]:
# model.steps[-1]
centers = model.named_steps['kmeans'].cluster_centers_
In [10]:
from heapq import nlargest
from operator import itemgetter
def rank_center_terms(k=0, n=10):
    # Pair every vocabulary term with its weight in the k-th cluster center;
    # get_feature_names() preserves column order (newer scikit-learn uses
    # get_feature_names_out()).
    words = model.named_steps['tfidf'].get_feature_names()
    return nlargest(n, zip(words, centers[k]), key=itemgetter(1))
rank_center_terms()
Out[10]:
In [11]:
rank_center_terms(1)
Out[11]:
In [12]:
model.named_steps['kmeans'].labels_
Out[12]:
In [13]:
def titles_for_cluster(k=0):
    # Print the title of every document assigned to cluster k.
    for idx, label in enumerate(model.named_steps['kmeans'].labels_):
        if label == k:
            print(corpus[idx]['Document Title'])

# titles_for_cluster()
In [14]:
%matplotlib notebook
import seaborn as sns
import matplotlib.pyplot as plt
sns.distplot(model.named_steps['kmeans'].labels_)
Out[14]:
In [15]:
import numpy as np
from sklearn.cluster import SpectralClustering
from yellowbrick.cluster import SilhouetteVisualizer
X = TfidfVectorizer().fit_transform([doc['Abstract'] for doc in corpus])
In [16]:
oz = SilhouetteVisualizer(MiniBatchKMeans())
oz.fit(X)
oz.poof()
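The visualizer above also runs with the eight-cluster default. One rough way to compare settings is to draw a silhouette plot per candidate k; a sketch assuming the same X matrix and the older poof() API:

# Sketch: silhouette profiles for a few candidate cluster counts.
for k in (4, 8, 12):
    oz = SilhouetteVisualizer(MiniBatchKMeans(n_clusters=k))
    oz.fit(X)
    oz.poof()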
In [19]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
class ColumnSelector(BaseEstimator, TransformerMixin):
    """Selects a single column (dict key) from each record in the corpus."""

    def __init__(self, column):
        self.column = column

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return [
            doc[self.column] for doc in X
        ]


class Tokenizer(BaseEstimator, TransformerMixin):
    """Splits a delimited string into a bag-of-terms dictionary."""

    def __init__(self, sep=";"):
        self.sep = sep

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return [
            {key: 1 for key in doc.split(self.sep)}
            for doc in X
        ]
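As an illustration of what the transformers produce, here is a made-up semicolon-delimited string in the shape of the IEEE export's 'Author Keywords' field:

# Hypothetical keyword string; Tokenizer turns it into a bag-of-terms dict.
Tokenizer().fit_transform(["machine learning;clustering;topic models"])
# -> [{'machine learning': 1, 'clustering': 1, 'topic models': 1}]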
In [62]:
from sklearn.decomposition import TruncatedSVD
model = Pipeline([
    ('cols', FeatureUnion([
        ('abstract', Pipeline([
            ('select', ColumnSelector('Abstract')),
            ('tfidf', TfidfVectorizer()),
        ])),
        ('key terms', Pipeline([
            ('select', ColumnSelector('Author Keywords')),
            ('tokenize', Tokenizer()),
            ('vect', DictVectorizer()),
        ])),
    ])),
    ('svd', TruncatedSVD(1000)),
])

docs = model.fit_transform(corpus)
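To gauge how much of the combined TF-IDF and keyword signal the 1000-component projection keeps, the fitted SVD step exposes its explained variance ratio (a quick check, not part of the original run):

# Fraction of total variance retained by the truncated SVD projection.
model.named_steps['svd'].explained_variance_ratio_.sum()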
In [64]:
docs[0].shape
Out[64]:
In [17]:
import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()
In [18]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
In [21]:
tfvec = CountVectorizer(
    strip_accents='unicode',
    stop_words='english',
    lowercase=True,
    token_pattern=r'\b[a-zA-Z]{3,}\b',
    max_df=0.5,
    min_df=10,
)
dtm_tf = tfvec.fit_transform(ColumnSelector('Abstract').fit_transform(corpus))
In [25]:
model = Pipeline([
    ('select', ColumnSelector('Abstract')),
    ('tfidf', TfidfVectorizer(**tfvec.get_params())),
    ('lda', LatentDirichletAllocation()),
])
model.fit(corpus)
Out[25]:
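Before handing the model to pyLDAvis, the fitted LDA's topic-word matrix can be inspected directly; a sketch listing the ten highest-weighted terms per topic, assuming the older get_feature_names() API (newer scikit-learn uses get_feature_names_out()):

# Print the ten highest-weighted terms for each LDA topic.
lda = model.named_steps['lda']
terms = model.named_steps['tfidf'].get_feature_names()
for tid, weights in enumerate(lda.components_):
    top = weights.argsort()[::-1][:10]
    print(tid, ", ".join(terms[i] for i in top))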
In [26]:
pyLDAvis.sklearn.prepare(model.named_steps['lda'], dtm_tf, tfvec)
Out[26]: