In [ ]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
#%config InlineBackend.figure_format = 'svg'
#%config InlineBackend.figure_format = 'pdf'
from IPython.core.display import HTML
import matplotlib.pyplot as plt
from nltk.tokenize import WhitespaceTokenizer
from nltk.stem.snowball import SnowballStemmer
import numpy as np
import os
import pandas as pd
try:
    import cPickle as pickle
except ImportError:
    import pickle
import re
import scipy.stats as stats
import scipy.sparse as sp
import string
import sys
import nltk
import csv
In [ ]:
def paper_dataframe(fpath):
    rows = []
    with open(fpath, 'r') as csvfile:
        reader = csv.reader(csvfile, delimiter=',', quotechar='"')
        # Each row gives ['Id', 'Title', 'EventType', 'PdfName', 'Abstract', 'PaperText']
        next(reader)  # skip the header row
        for row in reader:
            rows.append(tuple(row))
    data = pd.DataFrame(rows, columns=['Id', 'Title', 'EventType',
                                       'PdfName', 'Abstract', 'PaperText'])
    return data
def is_printable(s):
    """True if all characters are printable. Implicitly assumes English text."""
    return all_chars_in(s, string.printable)
def all_chars_in(s, char_set):
    """True if every character of s is in char_set."""
    return all(c in char_set for c in s)
def some_chars_in(s, char_set):
    """True if at least one character of s is in char_set."""
    return any(c in char_set for c in s)
def no_char_in(s, char_set):
    """True if no character of s is in char_set."""
    return not some_chars_in(s, char_set)
def tokenize(text):
text = text.lower()
tok = WhitespaceTokenizer()
words = [w.strip() for w in tok.tokenize(text)]
return words
def uncond_filter_words(list_words):
    """Unconditionally remove words according to the rules below."""
    L = list(list_words)
    filters = [
        lambda w: no_char_in(w, string.punctuation),
        lambda w: no_char_in(w, '0123456789'),
        # must start with a letter
        lambda w: re.match('[a-z].+', w.lower()) is not None,
        lambda w: 3 <= len(w) <= 20,
        # only printable characters
        lambda w: is_printable(w),
        # keep only words that contain at least one English letter
        lambda w: some_chars_in(w.lower(), string.ascii_lowercase),
    ]
    for f in filters:
        L = [w for w in L if f(w)]
    return L
def list_to_file(L, fpath):
with open(fpath, 'w') as f:
for w in L:
f.write('%s\n'%w)
def file_to_list(fpath):
with open(fpath, 'r') as f:
L = f.readlines()
L = [w.strip() for w in L]
return L
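In [ ]:
# Illustrative sanity check (not part of the original pipeline): the sample
# sentence below is made up. tokenize() lower-cases and splits on whitespace;
# uncond_filter_words() then drops tokens containing punctuation or digits,
# very short/long tokens, and tokens without English letters.
sample_text = 'Deep learning models, e.g. CNNs, won 3 of the 2015 benchmarks!'
print(tokenize(sample_text))
print(uncond_filter_words(tokenize(sample_text)))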
In [ ]:
def stack_papers(years, dest_path):
content = ''
for i, y in enumerate(years):
with open('../output%d/Papers.csv'%y, 'r') as pfile:
if i>0:
# skip header
pfile.readline()
content = content + pfile.read()
# write
with open(dest_path, 'w') as dfile:
dfile.write(content)
def hist_words(list_words):
"""Return a map: word->count from the list of words."""
m = {}
for i, w in enumerate(list_words):
m[w] = 1 if w not in m else m[w] + 1
return m
class CacheStemmer(object):
def __init__(self):
self.stemmer = SnowballStemmer('english')
self.cache = {}
def stem(self, word):
if word in self.cache:
return self.cache[word]
else:
st = self.stemmer.stem(word)
self.cache[word] = st
return st
def stem_words(words, stemmer=CacheStemmer()):
    """Stem each word, skipping words that cannot be decoded."""
    stemmed = []
    for w in words:
        try:
            stemmed.append(stemmer.stem(w))
        except UnicodeDecodeError:
            # skip words that cannot be decoded
            pass
    return stemmed
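In [ ]:
# Illustrative sanity check (toy input, not part of the original pipeline):
# 'models'/'model' and 'learning'/'learned' should collapse to the same stems,
# so hist_words() counts them together.
toy_words = ['models', 'model', 'learning', 'learned', 'kernel']
print(hist_words(stem_words(toy_words, CacheStemmer())))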
In [ ]:
stemmer = CacheStemmer()
stemmer.stem('recognize')
In [ ]:
fyear = 1988
tyear = 2015
stack_fname = 'Papers%d_%d.csv'%(fyear, tyear)
stack_papers(range(fyear, tyear+1), stack_fname)
In [ ]:
stemmer = CacheStemmer()
set_words = set()
paper_folders = ['../output%d'%y for y in range(fyear, tyear+1)]
# dictionary compiled from Scowl spelling checking
#dict_all = file_to_list('dict_all.txt')
# dictionary from Wordnet
dict_all = file_to_list('index.noun')
#dict_all = file_to_list('index.verb')
set_stop = set(file_to_list('stop_words.txt'))
set_dict = set(dict_all)
for oi, out_fol in enumerate(paper_folders):
fpath = os.path.join(out_fol, 'Papers.csv')
data = paper_dataframe(fpath)
for i in range(data.shape[0]):
abstract = data['Abstract'][i]
content = data['PaperText'][i]
title = data['Title'][i]
title_words = tokenize(title)
abs_words = tokenize(abstract)
content_words = tokenize(content)
words = title_words + abs_words + content_words
# only include words (before stemming) that are in the dictionary
# and not in the list of stop words
words = [w for w in words if w in set_dict and w not in set_stop]
stwords = stem_words(words, stemmer)
set_words.update(stwords)
In [ ]:
uncond_words = uncond_filter_words(set_words)
uncond_words.sort()
list_to_file(uncond_words, 'uncond_filtered_words.txt')
print('unconditionally filtered words: %d' % len(uncond_words))
In [ ]:
w2ind = dict(zip(uncond_words, range(len(uncond_words))) )
data = paper_dataframe(stack_fname)
In [ ]:
doc_ind = []
word_ind = []
counts = []
titles = []
for i in range(data.shape[0]):
abstract = data['Abstract'][i]
content = data['PaperText'][i]
title = data['Title'][i]
titles.append(title)
title_words = tokenize(title)
abs_words = tokenize(abstract)
content_words = tokenize(content)
di_words = title_words + abs_words + content_words
stwords = stem_words(di_words, stemmer)
hist = hist_words(stwords)
    for w, c in hist.items():
if w in w2ind:
# ignore terms which are not in the vocabulary
wi = w2ind[w]
doc_ind.append(i)
word_ind.append(wi)
counts.append(c)
In [ ]:
# save DT to a file
DT = sp.csr_matrix( (counts, (doc_ind, word_ind)), shape=(data.shape[0], len(uncond_words)) )
dt_fpath = 'DT_%d_%d.p'%(fyear, tyear)
info = {'DT': DT, 'words': uncond_words, 'titles': titles}
with open(dt_fpath, 'wb') as f:
pickle.dump(info, f)
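In [ ]:
# Illustrative example (toy data) of the (counts, (row, col)) construction used
# above: document 0 contains term 2 once; document 1 contains term 0 twice and
# term 1 once.
toy_DT = sp.csr_matrix(([1, 2, 1], ([0, 1, 1], [2, 0, 1])), shape=(2, 3))
print(toy_DT.todense())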
In [ ]:
DT.shape
In [ ]:
with open(dt_fpath, 'rb') as f:
info = pickle.load(f)
words = info['words']
DT = info['DT']
titles = info['titles']
In [ ]:
# document frequency of each word
n = DT.shape[0]
DF = np.array( (DT > 0).sum(0) )[0]
df_lb = 5
df_ub = int(0.4*n)
print('#docs: %d'%n)
print('original #words: %d'%len(words))
print('#words with %d <= df: %d'% (df_lb, np.sum(DF>=df_lb) ) )
print('#words with df <= %d: %d'% (df_ub, np.sum(DF<=df_ub) ) )
df_I = np.logical_and(DF>=df_lb, DF<=df_ub)
print('#words with %d <= df <= %d: %d'%
(df_lb, df_ub, np.sum( df_I) ) )
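In [ ]:
# Illustrative example (toy data) of the document-frequency computation above:
# (X > 0).sum(0) counts in how many documents each term occurs at least once.
toy_X = sp.csr_matrix(np.array([[2, 0], [1, 1], [0, 0]]))
print(np.array((toy_X > 0).sum(0))[0])  # expected: [2 1]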
In [ ]:
plt.plot(sorted(DF))
plt.xlabel('word index')
plt.ylabel('doc frequency')
In [ ]:
df_words = np.array(words)[df_I]
df_words = df_words.tolist()
list_to_file(df_words, 'words_df%d_%d.txt' % (df_lb, df_ub))
In [ ]:
def has_some_substring(text, substrings):
    """True if the (lower-cased) text contains at least one substring in the list."""
    text = text.lower()
    return any(text.find(s) > -1 for s in substrings)
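In [ ]:
# Quick illustrative check of the keyword matcher (the title below is made up).
print(has_some_substring('A Bayesian Model of Spiking Neurons', ['bayesian', 'kernel']))
print(has_some_substring('A Bayesian Model of Spiking Neurons', ['regret']))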
In [ ]:
stitles = sorted(titles)
supervised_kws = ['large margin', 'classif', 'regression', 'kernel', 'ensemble',
'neural net']
deep_kws = ['deep', 'drop out', 'auto-encod', 'convolutional', 'neural net', 'belief net',
'boltzmann']
neuro_kws = ['motor control', 'neural', 'neuron', 'spiking', 'spike', 'cortex', 'plasticity',
'neural decod', 'neural encod', 'brain imag', 'biolog', 'perception', 'cognitive',
'emotion', 'synap', 'neural population', 'cortical', 'firing rate', 'firing-rate',
'sensor']
bayesian_kws = ['graphical model', 'bayesian', 'inference', 'mcmc', 'monte carlo',
'posterior', 'prior', 'variational', 'markov', 'latent', 'probabilistic',
'exponential family']
kernel_kws = ['kernel', 'distribution embedding', 'support vector', 'gaussian process']
learning_kws = ['learning theory', 'consistency', 'theoretical guarantee',
'complexity', 'pac-bayes', 'pac-learning', 'generalization',
'uniform converg', 'bound', 'deviation', 'inequality', 'risk min', 'minimax',
'structural risk', 'VC', 'rademacher', 'asymptotic']
rl_kws = ['reinforce', 'regret', 'apprenticeship', 'game', 'TD', 'mdp', 'markov decision',
'agent', 'reward', 'player', 'thompson', 'policy', 'policies', 'value function',
'Q learning', 'Q-learning', 'planning', 'bandit', 'value iteration']
neuro_titles = [t for t in stitles if has_some_substring(t, neuro_kws)]
bayesian_titles = [t for t in stitles if has_some_substring(t, bayesian_kws)]
deep_titles = [t for t in stitles if has_some_substring(t, deep_kws)]
supervised_titles = [t for t in stitles if has_some_substring(t, supervised_kws)]
kernel_titles = [t for t in stitles if has_some_substring(t, kernel_kws)]
learning_titles = [t for t in stitles if has_some_substring(t, learning_kws)]
rl_titles = [t for t in stitles if has_some_substring(t, rl_kws)]
In [ ]:
for w in deep_kws:
    print('%s, ' % w, end='')
In [ ]:
print(len(learning_titles))
learning_titles
In [ ]:
# Choose the two document themes to compare. By default both are the Bayesian
# theme, so P and Q are drawn from the same population of papers.
title_kws1 = bayesian_kws
fname1 = 'bayes'
title_kws2 = bayesian_kws
fname2 = 'bayes'
#title_kws1 = neuro_kws
#fname1 = 'neuro'
#title_kws2 = neuro_kws
#fname2 = 'neuro'
#title_kws1 = deep_kws
#fname1 = 'deep'
#title_kws2 = deep_kws
#fname2 = 'deep'
#title_kws2 = learning_kws
#fname2 = 'learning'
np.random.seed(2990)
set_ind1 = set([i for i in range(n) if has_some_substring(titles[i], title_kws1) ])
set_ind2 = set([i for i in range(n) if has_some_substring(titles[i], title_kws2) ])
#split_mode = 'disjoint'
split_mode = 'random'
if split_mode == 'disjoint':
# split the matched titles into two disjoint sets
common_ind = set_ind1 & set_ind2
# remove common documents in both
doc_I1 = np.array(list(set_ind1.difference(common_ind)))
doc_I2 = np.array(list(set_ind2.difference(common_ind)))
#doc_logI1 = np.zeros(n, dtype=np.bool)
#doc_logI1[doc_I1] = True
#doc_logI2 = np.zeros(n, dtype=np.bool)
#doc_logI2[doc_I2] = True
elif split_mode == 'random':
# consider only fname1 and randomly split the samples into two disjoint halves.
list_ind1 = np.array(list(set_ind1))
half_ind = int(len(list_ind1)/2.001)
split_ind = np.random.choice(len(list_ind1), half_ind, replace=False)
doc_I1 = list_ind1[split_ind]
doc_I2 = np.array(list(set_ind1.difference(set(doc_I1))))
else:
raise ValueError('unknown split_mode')
Pdoc = DT[doc_I1, :]
Qdoc = DT[doc_I2, :]
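In [ ]:
# Sanity check (illustrative): under both split modes the two document index
# sets should be disjoint, so no paper appears in both P and Q.
assert len(set(doc_I1) & set(doc_I2)) == 0
print('P docs: %d, Q docs: %d' % (len(doc_I1), len(doc_I2)))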
In [ ]:
len(doc_I1)
In [ ]:
print('set 1. %d titles:'%len(doc_I1))
for i in doc_I1:
print(titles[i])
In [ ]:
print('set 2. %d titles:'%len(doc_I2))
for i in doc_I2:
print(titles[i])
In [ ]:
# remove all the words that do not occur in any doc
PQdoc = sp.vstack((Pdoc, Qdoc))
pq_occur_wordI = np.array(PQdoc.sum(0) >= 1)[0]
andI = np.logical_and(pq_occur_wordI, df_I)
#andI = df_I
pq_words = np.array(words)[andI]
P = Pdoc[:, andI]
Q = Qdoc[:, andI]
In [ ]:
print('words left: %d'%P.shape[1])
In [ ]:
import sklearn.cluster as clu
# k-means on the terms
n_clusters = 2000
clust = clu.KMeans(n_clusters=n_clusters, n_init=5,
#init='random',
random_state=12)
PQ = sp.vstack((P,Q))
PQ01 = PQ > 0
In [ ]:
IDF = (np.log(n) - np.log(DF[andI]))
# .tocsr() in case multiply() returns a COO matrix, which does not support column slicing
P_tfidf = P.multiply(IDF).tocsr()
Q_tfidf = Q.multiply(IDF).tocsr()
tfidf = sp.vstack((P_tfidf, Q_tfidf))
#row_norms = np.array(PQ.power(2).sum(1))**0.5
#PQ_norm = PQ.multiply(1.0/row_norms)
PQ_norm = tfidf
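In [ ]:
# Illustrative example (toy numbers) of the IDF weights used above: with n
# documents and a term occurring in df of them, the weight is log(n) - log(df)
# = log(n/df), so rarer terms get larger weights.
print(np.log(100) - np.log(np.array([5.0, 20.0, 40.0])))  # IDF for df = 5, 20, 40 when n = 100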
In [ ]:
mode = 'random'
if mode=='random':
# Kacper: pick subset of random words
words_ind = np.random.choice(P_tfidf.shape[1], n_clusters, replace=False)
elif mode=='kmeans':
clust.fit(PQ_norm.T)
    mode_cluster = np.bincount(clust.labels_).argmax()  # label of the most populated cluster
mode_words = pq_words[clust.labels_==mode_cluster]
#for w in mode_words:
# print w,
plt.plot(sorted(clust.labels_), 'o')
plt.xlabel('word index')
plt.ylabel('cluster index')
# first word in each cluster
words_ind = []
for ci in range(n_clusters):
ind = np.where(clust.labels_==ci)[0]
words_ind.append(ind[0])
In [ ]:
final_words = pq_words[words_ind]
P_clus = P_tfidf[:, words_ind]
Q_clus = Q_tfidf[:, words_ind]
sortI = np.argsort(final_words)
swords = final_words[sortI]
P_sort = P_clus[:, sortI]
Q_sort = Q_clus[:, sortI]
# Construct the final PQ
np_titles = np.array(titles)
# densify the sparse slices; np.array() on a sparse matrix would not convert it
P_arr = np.asarray(P_sort.todense())
Q_arr = np.asarray(Q_sort.todense())
data = {'P': P_arr, 'Q': Q_arr, 'words': swords,
'P_titles': np_titles[doc_I1], 'Q_titles': np_titles[doc_I2], 'P_theme': fname1,
'Q_theme': fname2}
fname = '%s_%s_np%d_nq%d_d%d.p'%(fname1, fname2, P_sort.shape[0], Q_sort.shape[0], n_clusters)
with open(fname, 'wb') as dest:
pickle.dump(data, dest)
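In [ ]:
# Optional check (illustrative): reload the pickle that was just written and
# confirm the shapes of the saved samples and vocabulary.
with open(fname, 'rb') as f:
    saved = pickle.load(f)
print(saved['P'].shape, saved['Q'].shape, len(saved['words']))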
In [ ]:
# final word list
for i, w in enumerate(swords):
    if i % 10 == 0:
        print('')
    print(w, end=' ')
In [ ]: