In [ ]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
#%config InlineBackend.figure_format = 'svg'
#config InlineBackend.figure_format = 'pdf'
from IPython.core.display import HTML
import matplotlib.pyplot as plt
from nltk.tokenize import WhitespaceTokenizer
from nltk.stem.snowball import SnowballStemmer
import numpy as np
import os
import pandas as pd
import cPickle as pickle
import pickle
import re
import scipy.stats as stats
import scipy.sparse as sp
import string
import sys
import nltk
import csv
In [ ]:
def paper_dataframe(fpath):
    """Load a Papers.csv file into a pandas DataFrame.

    The file is parsed with the ``csv`` module (comma-delimited, fields
    quoted with ``"``), so quoted fields may contain commas and newlines.

    Args:
        fpath: path to the CSV file to read.

    Returns:
        A DataFrame with the columns 'Id', 'Title', 'EventType', 'PdfName',
        'Abstract' and 'PaperText'. Every value is kept as the string that
        the csv reader produced. If the first record of the file is the
        header (i.e. equals the column names) it is not included as data.
    """
    columns = ['Id', 'Title', 'EventType', 'PdfName', 'Abstract', 'PaperText']
    with open(fpath, 'r') as csvfile:
        reader = csv.reader(csvfile, delimiter=',', quotechar='"')
        # Each read gives ['Id', 'Title', 'EventType', 'PdfName', 'Abstract', 'PaperText'].
        # Blank lines come back as empty lists and are skipped.
        rows = [row for row in reader if row]
    # The header line is not a paper: drop it so it is not counted as a document.
    if rows and rows[0] == columns:
        rows = rows[1:]
    return pd.DataFrame(rows, columns=columns)
def is_printable(s):
    """Tell whether ``s`` consists solely of characters from string.printable.

    string.printable only holds ASCII characters, so English text is
    implicitly assumed.
    """
    printable_chars = string.printable
    return all_chars_in(s, printable_chars)
def all_chars_in(s, char_set):
    """Return True if every character of ``s`` is contained in ``char_set``.

    Args:
        s: iterable of characters (typically a string).
        char_set: container supporting ``in`` (string, set, ...).

    Returns:
        True when no character of ``s`` falls outside ``char_set``; an empty
        ``s`` therefore yields True.
    """
    return all(c in char_set for c in s)
def some_chars_in(s, char_set):
    """Return True if at least one character of ``s`` is in ``char_set``.

    Args:
        s: iterable of characters (typically a string).
        char_set: container supporting ``in`` (string, set, ...).

    Returns:
        True as soon as one character of ``s`` is found in ``char_set``;
        an empty ``s`` yields False.
    """
    return any(c in char_set for c in s)
def no_char_in(s, char_set):
    """Return True if none of the characters of ``s`` is in ``char_set``.

    Args:
        s: iterable of characters (typically a string).
        char_set: container supporting ``in`` (string, set, ...).

    Returns:
        False as soon as one character of ``s`` is found in ``char_set``;
        an empty ``s`` yields True.
    """
    return not any(c in char_set for c in s)
def tokenize(text):
    """Lower-case ``text`` and split it into tokens with WhitespaceTokenizer.

    Returns the list of tokens, each one passed through ``strip()``.
    """
    lowered = text.lower()
    tokens = WhitespaceTokenizer().tokenize(lowered)
    return [token.strip() for token in tokens]
def uncond_filter_words(list_words):
    """Unconditionally remove words according to some rules.

    A word is kept only if ALL of the following hold:
      * it contains no punctuation character (string.punctuation),
      * it contains no digit,
      * lower-cased, it starts with a letter a-z followed by at least one
        more character,
      * its length is between 3 and 20 characters (inclusive),
      * it is made of printable (ASCII) characters only,
      * lower-cased, it contains at least one English letter.

    Args:
        list_words: iterable of words (a list or a set both work).

    Returns:
        A list with the words that passed every rule, in iteration order of
        ``list_words``. A real list is returned (not a lazy iterator) because
        callers take its ``len()`` and index into it.
    """
    digits = '0123456789'
    # string.ascii_lowercase is locale-independent and exists on Python 2 and 3,
    # unlike string.lowercase.
    letters = string.ascii_lowercase
    # start with alphabets
    starts_with_letter = re.compile('[a-z].+')

    def keep(w):
        lowered = w.lower()
        return (
            3 <= len(w) <= 20
            and not any(c in string.punctuation for c in w)
            and not any(c in digits for c in w)
            and starts_with_letter.match(lowered) is not None
            # only printable characters
            and all(c in string.printable for c in w)
            # remove words that do not have English alphabet
            and any(c in letters for c in lowered)
        )

    return [w for w in list_words if keep(w)]
def list_to_file(L, fpath):
    """Write the items of ``L`` to a text file, one item per line.

    This is the inverse of ``file_to_list``: each item is formatted with
    ``%s`` and terminated by a newline.

    Args:
        L: iterable of items to write (strings, or anything ``%s`` can format).
        fpath: destination path; an existing file is overwritten.
    """
    with open(fpath, 'w') as f:
        f.writelines('%s\n' % w for w in L)
def file_to_list(fpath):
    """Read a text file and return its lines as a list.

    Leading and trailing whitespace (including the newline) is stripped
    from every line.
    """
    with open(fpath, 'r') as handle:
        return [line.strip() for line in handle]
In [ ]:
def stack_papers(years, dest_path, src_pattern='../output%d/Papers.csv'):
    """Concatenate the per-year Papers.csv files into a single CSV file.

    The header line is kept from the first file only; for every following
    file the first line is skipped so the header is not repeated.

    Args:
        years: iterable of years; each one is substituted into ``src_pattern``.
        dest_path: path of the stacked CSV file to write (overwritten).
        src_pattern: ``%d``-style pattern giving the source file of a year.
            Defaults to the layout '../output<year>/Papers.csv'.
    """
    parts = []
    for i, y in enumerate(years):
        with open(src_pattern % y, 'r') as pfile:
            if i > 0:
                # skip header
                pfile.readline()
            chunk = pfile.read()
        # Make sure the last record of one file cannot be glued to the
        # first record of the next one.
        if chunk and not chunk.endswith('\n'):
            chunk += '\n'
        parts.append(chunk)
    # write (only after every source was read, so a missing source file
    # does not leave a truncated destination behind)
    with open(dest_path, 'w') as dfile:
        dfile.write(''.join(parts))
def hist_words(list_words):
    """Return a map: word->count from the list of words.

    Args:
        list_words: iterable of hashable words.

    Returns:
        A plain dict mapping each distinct word to the number of times it
        occurs in ``list_words`` (empty dict for empty input).
    """
    from collections import Counter
    return dict(Counter(list_words))
class CacheStemmer(object):
    """English Snowball stemmer that memoizes the stem of every word seen."""

    def __init__(self):
        self.stemmer = SnowballStemmer('english')
        # word -> stem, filled lazily by stem()
        self.cache = {}

    def stem(self, word):
        """Return the stem of ``word``, computing it only on first request."""
        try:
            return self.cache[word]
        except KeyError:
            stemmed = self.stemmer.stem(word)
            self.cache[word] = stemmed
            return stemmed
def stem_words(words, stemmer=None):
    """Stem every word of ``words`` and return the list of stems.

    Words whose stemming raises UnicodeDecodeError are skipped, so the
    result may be shorter than the input.

    Args:
        words: iterable of words to stem.
        stemmer: object with a ``stem(word)`` method. When omitted, one
            CacheStemmer is created on first use and shared by all later
            calls that also omit it, so its cache is reused across calls.

    Returns:
        List of stems, in input order, without the words that failed.
    """
    if stemmer is None:
        # Built lazily instead of in the signature, so defining this function
        # does not construct a stemmer as a side effect.
        stemmer = getattr(stem_words, '_default_stemmer', None)
        if stemmer is None:
            stemmer = stem_words._default_stemmer = CacheStemmer()
    stemmed = []
    for w in words:
        try:
            s = stemmer.stem(w)
        except UnicodeDecodeError:
            # best effort: a word that cannot be decoded is dropped
            #print('decode error for: %s'%w)
            continue
        stemmed.append(s)
    return stemmed
In [ ]:
# Stemmer with a word->stem cache, passed to stem_words() in later cells.
stemmer = CacheStemmer()
In [ ]:
# First and last year (inclusive) of the per-year '../output<year>' folders.
fyear = 1988
tyear = 2015
# Name of the single CSV file that stacks the Papers.csv of all the years.
stack_fname = 'Papers%d_%d.csv'%(fyear, tyear)
stack_papers(range(fyear, tyear+1), stack_fname)
In [ ]:
# Go through every paper of every year: tokenize title + abstract + text,
# keep the dictionary words that are not stop words, and stem them.
stemmer = CacheStemmer()
set_words = set()
paper_folders = ['../output%d'%y for y in range(fyear, tyear+1)]
# dictionary compiled from Scowl spelling checking
#dict_all = file_to_list('dict_all.txt')
# dictionary from Wordnet
# NOTE(review): file_to_list returns whole stripped lines, and membership below
# is tested against those whole lines -- confirm that 'index.noun' really holds
# one bare word per line.
dict_all = file_to_list('index.noun')
#dict_all = file_to_list('index.verb')
set_stop = set(file_to_list('stop_words.txt'))
set_dict = set(dict_all)
for oi, out_fol in enumerate(paper_folders):
fpath = os.path.join(out_fol, 'Papers.csv')
data = paper_dataframe(fpath)
for i in range(data.shape[0]):
abstract = data['Abstract'][i]
content = data['PaperText'][i]
title = data['Title'][i]
title_words = tokenize(title)
abs_words = tokenize(abstract)
content_words = tokenize(content)
words = title_words + abs_words + content_words
# only include words (before stemming) that are in the dictionary
# and not in the list of stop words
words = [w for w in words if w in set_dict and w not in set_stop]
stwords = stem_words(words, stemmer)
# NOTE(review): nothing visible here ever adds stwords to set_words, yet
# set_words is what the next cell filters -- a statement accumulating the stems
# into set_words appears to be missing after the line above; confirm.
In [ ]:
# Apply the unconditional word rules to the collected vocabulary and save it.
uncond_words = uncond_filter_words(set_words)
list_to_file(uncond_words, 'uncond_filtered_words.txt')
# len() below needs uncond_filter_words to return a sized sequence (a list).
print('unconditionally filters words: %d'%(len(uncond_words)))
In [ ]:
# Map each vocabulary word to its column index in the document-term matrix.
w2ind = dict(zip(uncond_words, range(len(uncond_words))) )
# Reload all the papers from the stacked CSV file written by stack_papers().
data = paper_dataframe(stack_fname)
In [ ]:
# Triplets (document index, word index, count) used later to build the sparse
# document-term matrix DT, plus the list of paper titles.
doc_ind = []
word_ind = []
counts = []
titles = []
for i in range(data.shape[0]):
abstract = data['Abstract'][i]
content = data['PaperText'][i]
title = data['Title'][i]
title_words = tokenize(title)
abs_words = tokenize(abstract)
content_words = tokenize(content)
di_words = title_words + abs_words + content_words
stwords = stem_words(di_words, stemmer)
# stem -> number of occurrences in document i
hist = hist_words(stwords)
# dict.iteritems() exists on Python 2 only.
for w, c in hist.iteritems():
if w in w2ind:
# ignore terms which are not in the vocabulary
wi = w2ind[w]
# NOTE(review): doc_ind, word_ind, counts and titles are never appended to in
# the visible code although the next cell consumes all four -- the appends
# (i, wi, c, and the title of each document) appear to be missing; confirm.
In [ ]:
# save DT to a file
# DT is a sparse (#documents x #vocabulary words) matrix of term counts.
DT = sp.csr_matrix( (counts, (doc_ind, word_ind)), shape=(data.shape[0], len(uncond_words)) )
dt_fpath = 'DT_%d_%d.p'%(fyear, tyear)
info = {'DT': DT, 'words': uncond_words, 'titles': titles}
# NOTE(review): the pickle file is opened in text mode ('w' / 'r'). Binary
# pickle protocols, and any pickle on Python 3, need 'wb' / 'rb' -- confirm the
# interpreter and protocol this is meant for.
with open(dt_fpath, 'w') as f:
pickle.dump(info, f)
# Read the file back and continue from the loaded copy.
with open(dt_fpath, 'r') as f:
info = pickle.load(f)
words = info['words']
DT = info['DT']
titles = info['titles']
# document frequency of each word
n = DT.shape[0]
# DF[j] = number of documents in which word j has a non-zero count
DF = np.array( (DT > 0).sum(0) )[0]
# Keep the words whose document frequency lies in [df_lb, df_ub]:
# at least 5 documents and at most 40% of all the documents.
df_lb = 5
df_ub = int(0.4*n)
print('#docs: %d'%n)
print('original #words: %d'%len(words))
print('#words with %d <= df: %d'% (df_lb, np.sum(DF>=df_lb) ) )
print('#words with df <= %d: %d'% (df_ub, np.sum(DF<=df_ub) ) )
# boolean mask over the vocabulary
df_I = np.logical_and(DF>=df_lb, DF<=df_ub)
print('#words with %d <= df <= %d: %d'%
(df_lb, df_ub, np.sum( df_I) ) )
# NOTE(review): the axes are labelled but no plotting call is visible before
# these two lines -- a plot of DF presumably belongs here; confirm.
plt.xlabel('word index')
plt.ylabel('doc frequency')
df_words = np.array(words)[df_I]
list_to_file(df_words, 'words_df%d_%d.txt'%(df_lb, df_ub))
def has_some_substring(text, substrings):
    """True if the text contains at least one substring in the list.

    ``text`` is lower-cased before the search; the substrings are used as
    given. A substring containing an upper-case letter can therefore never
    match.

    Args:
        text: string to search in.
        substrings: iterable of strings to look for.

    Returns:
        True if at least one substring occurs in the lower-cased text,
        False otherwise (including when ``substrings`` is empty).
    """
    # Lower-case once instead of once per substring.
    lowered = text.lower()
    return any(s in lowered for s in substrings)
# Group the paper titles by topic, using keyword substrings matched against
# the lower-cased title (see has_some_substring).
stitles = sorted(titles)
supervised_kws = ['large margin', 'classif', 'regression', 'kernel', 'ensemble',
'neural net']
# NOTE(review): the deep_kws list below is never closed with ']' in the visible
# code -- its last line(s) appear to be missing; confirm the full keyword list.
deep_kws = ['deep', 'drop out', 'auto-encod', 'convolutional', 'neural net', 'belief net',
# NOTE(review): same for neuro_kws -- no closing ']' is visible.
neuro_kws = ['motor control', 'neural', 'neuron', 'spiking', 'spike', 'cortex', 'plasticity',
'neural decod', 'neural encod', 'brain imag', 'biolog', 'perception', 'cognitive',
'emotion', 'synap', 'neural population', 'cortical', 'firing rate', 'firing-rate',
bayesian_kws = ['graphical model', 'bayesian', 'inference', 'mcmc', 'monte carlo',
'posterior', 'prior', 'variational', 'markov', 'latent', 'probabilistic',
'exponential family']
kernel_kws = ['kernel', 'distribution embedding', 'support vector', 'gaussian process']
# NOTE(review): has_some_substring lower-cases the title but not the keyword,
# so keywords containing capitals ('VC' here; 'TD', 'Q learning', 'Q-learning'
# in rl_kws) can never match -- confirm whether that is intended.
learning_kws = ['learning theory', 'consistency', 'theoretical guarantee',
'complexity', 'pac-bayes', 'pac-learning', 'generalization',
'uniform converg', 'bound', 'deviation', 'inequality', 'risk min', 'minimax',
'structural risk', 'VC', 'rademacher', 'asymptotic']
rl_kws = ['reinforce', 'regret', 'apprenticeship', 'game', 'TD', 'mdp', 'markov decision',
'agent', 'reward', 'player', 'thompson', 'policy', 'policies', 'value function',
'Q learning', 'Q-learning', 'planning', 'bandit', 'value iteration']
# Titles matching at least one keyword of each group.
# filter() returns a list on Python 2 but a lazy iterator on Python 3.
neuro_titles = filter(lambda t: has_some_substring(t, neuro_kws), stitles)
bayesian_titles = filter(lambda t: has_some_substring(t, bayesian_kws), stitles)
deep_titles = [t for t in stitles if has_some_substring(t, deep_kws)]
supervised_titles = [t for t in stitles if has_some_substring(t, supervised_kws)]
kernel_titles = [t for t in stitles if has_some_substring(t, kernel_kws)]
learning_titles = [t for t in stitles if has_some_substring(t, learning_kws)]
rl_titles = [t for t in stitles if has_some_substring(t, rl_kws)]
# Python 2 print statements (trailing comma = stay on the same line).
for w in deep_kws:
print '%s, '%w,
print len(learning_titles)
# Choose the keyword group (and a short name used in the output file name) for
# each of the two document samples. As written, both samples use bayesian_kws.
title_kws1 = bayesian_kws
fname1 = 'bayes'
title_kws2 = bayesian_kws
fname2 = 'bayes'
#title_kws1 = neuro_kws
#fname1 = 'neuro'
#title_kws2 = neuro_kws
#fname2 = 'neuro'
#title_kws1 = deep_kws
#fname1 = 'deep'
#title_kws2 = deep_kws
#fname2 = 'deep'
#title_kws2 = learning_kws
#fname2 = 'learning'
# Same values as assigned at the top of this section.
title_kws1 = bayesian_kws
fname1 = 'bayes'
#title_kws2 = deep_kws
#fname2 = 'deep'
# Row indices (into DT / titles) of the documents whose title matches.
set_ind1 = set([i for i in range(n) if has_some_substring(titles[i], title_kws1) ])
set_ind2 = set([i for i in range(n) if has_some_substring(titles[i], title_kws2) ])
#split_mode = 'disjoint'
split_mode = 'random'
if split_mode == 'disjoint':
# split the matched titles into two disjoint sets
common_ind = set_ind1 & set_ind2
# remove common documents in both
doc_I1 = np.array(list(set_ind1.difference(common_ind)))
doc_I2 = np.array(list(set_ind2.difference(common_ind)))
#doc_logI1 = np.zeros(n, dtype=np.bool)
#doc_logI1[doc_I1] = True
#doc_logI2 = np.zeros(n, dtype=np.bool)
#doc_logI2[doc_I2] = True
elif split_mode == 'random':
# consider only fname1 and randomly split the samples into two disjoint halves.
list_ind1 = np.array(list(set_ind1))
# Dividing by 2.001 rather than 2 makes the first part strictly smaller than
# half when the count is even.
half_ind = int(len(list_ind1)/2.001)
# np.random is not seeded in the visible code, so the split changes per run.
split_ind = np.random.choice(len(list_ind1), half_ind, replace=False)
doc_I1 = list_ind1[split_ind]
doc_I2 = np.array(list(set_ind1.difference(set(doc_I1))))
# NOTE(review): an 'else:' presumably preceded this raise; as visible it
# follows the elif body directly -- confirm.
raise ValueError('unknown split_mode')
# Rows of the document-term matrix for the two samples.
Pdoc = DT[doc_I1, :]
Qdoc = DT[doc_I2, :]
# NOTE(review): the two 'for' loops below have no visible body (presumably they
# printed the title of each document) -- confirm.
print('set 1. %d titles:'%len(doc_I1))
for i in doc_I1:
print('set 2. %d titles:'%len(doc_I2))
for i in doc_I2:
# remove all the words that do not occur in any doc
PQdoc = sp.vstack((Pdoc, Qdoc))
pq_occur_wordI = np.array(PQdoc.sum(0) >= 1)[0]
# keep the words that occur in P or Q AND pass the document-frequency filter
andI = np.logical_and(pq_occur_wordI, df_I)
#andI = df_I
pq_words = np.array(words)[andI]
P = Pdoc[:, andI]
Q = Qdoc[:, andI]
print('words left: %d'%P.shape[1])
# Weight the counts by IDF and reduce the vocabulary to n_clusters words,
# either at random or from a k-means clustering of the terms.
import sklearn.cluster as clu
# k-means on the terms
n_clusters = 2000
# NOTE(review): the KMeans(...) call below is cut off (no closing parenthesis,
# remaining arguments and any fit call are not visible) -- confirm.
clust = clu.KMeans(n_clusters=n_clusters, n_init=5,
PQ = sp.vstack((P,Q))
PQ01 = PQ > 0
# inverse document frequency log(n / DF) of the kept words
IDF = (np.log(n) - np.log(DF[andI]))
P_tfidf = P.multiply(IDF)
Q_tfidf = Q.multiply(IDF)
tfidf = sp.vstack((P_tfidf, Q_tfidf))
#row_norms = np.array(PQ.power(2).sum(1))**0.5
#PQ_norm = PQ.multiply(1.0/row_norms)
PQ_norm = tfidf
mode = 'random'
if mode=='random':
# Kacper: pick subset of random words
# np.random.choice with replace=False needs at least n_clusters columns.
words_ind = np.random.choice(P_tfidf.shape[1], n_clusters, replace=False)
elif mode=='kmeans':
# most frequent cluster label and the words assigned to it
mode_cluster = stats.mode(clust.labels_)[0][0]
mode_words = pq_words[clust.labels_==mode_cluster]
#for w in mode_words:
# print w,
plt.plot(sorted(clust.labels_), 'o')
plt.xlabel('word index')
plt.ylabel('cluster index')
# first word in each cluster
words_ind = []
for ci in range(n_clusters):
ind = np.where(clust.labels_==ci)[0]
# NOTE(review): words_ind is never appended to in the visible code; given the
# "first word in each cluster" comment, an append of ind[0] appears to be
# missing inside the loop -- confirm.
# Restrict words and both samples to the selected word indices, then order
# the columns alphabetically by word.
final_words = pq_words[words_ind]
P_clus = P_tfidf[:, words_ind]
Q_clus = Q_tfidf[:, words_ind]
sortI = np.argsort(final_words)
swords = final_words[sortI]
P_sort = P_clus[:, sortI]
Q_sort = Q_clus[:, sortI]
# Construct the final PQ
np_titles = np.array(titles)
# NOTE(review): P_sort / Q_sort come from slicing the result of
# sparse.multiply(); if they are still scipy sparse matrices, np.array() does
# not densify them (toarray() would) -- confirm what type reaches this point.
P_arr = np.array(P_sort)
Q_arr = np.array(Q_sort)
# Everything needed downstream: the two samples, the word of each column, the
# titles of the documents in each sample and the theme names.
data = {'P': P_arr, 'Q': Q_arr, 'words': swords,
'P_titles': np_titles[doc_I1], 'Q_titles': np_titles[doc_I2], 'P_theme': fname1,
'Q_theme': fname2}
# File name encodes both themes, the two sample sizes and the word count.
fname = '%s_%s_np%d_nq%d_d%d.p'%(fname1, fname2, P_sort.shape[0], Q_sort.shape[0], n_clusters)
# NOTE(review): text mode 'w'; binary pickle protocols and Python 3 need 'wb'.
with open(fname, 'w') as dest:
pickle.dump(data, dest)
# final word list
# Prints every 10th word on one line (Python 2 print statement).
for (i, w) in enumerate(swords):
if i%10==0:
print w,
In [ ]: