In [ ]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
#%config InlineBackend.figure_format = 'svg'
#%config InlineBackend.figure_format = 'pdf'
from IPython.core.display import HTML
import matplotlib.pyplot as plt
from nltk.tokenize import WhitespaceTokenizer
from nltk.stem.snowball import SnowballStemmer
import numpy as np
import os
import pandas as pd
try:
    import cPickle as pickle
except ImportError:
    import pickle

import re
import scipy.stats as stats
import scipy.sparse as sp
import string
import sys
import nltk
import csv

In [ ]:
def paper_dataframe(fpath):
    rows = []
    with open(fpath, 'r') as csvfile:
        reader = csv.reader(csvfile, delimiter=',', quotechar='"')
        # Each read gives ['Id', 'Title', 'EventType', 'PdfName', 'Abstract', 'PaperText']
        next(reader)  # skip the header row
        for row in reader:
            rows.append(tuple(row))
    data = pd.DataFrame(rows, columns=['Id', 'Title', 'EventType', 
                                   'PdfName', 'Abstract', 'PaperText'])
    return data

def is_printable(s):
    """True if all characters are printable. Implicitly assume English."""
    return all_chars_in(s, string.printable)

def all_chars_in(s, char_set):
    for c in s:
        if c not in char_set:
            return False
    return True

def some_chars_in(s, char_set):
    for c in s:
        if c in char_set:
            return True
    return False

def no_char_in(s, char_set):
    for c in s:
        if c in char_set:
            return False 
    return True

def tokenize(text):
    text = text.lower()
    tok = WhitespaceTokenizer()
    words = [w.strip() for w in tok.tokenize(text)]
    return words

def uncond_filter_words(list_words):
    """Unconditionally remove words according to some rules"""
    L = list_words
    filters = [
        # no punctuation
        lambda w: no_char_in(w, string.punctuation),
        # no digits
        lambda w: no_char_in(w, '0123456789'),
        # start with an alphabetic character
        lambda w: re.match('[a-z].+', w.lower()) is not None,
        # length between 3 and 20 characters
        lambda w: len(w) >= 3 and len(w) <= 20,
        # only printable characters
        lambda w: is_printable(w),
        # contain at least one English letter
        lambda w: some_chars_in(w.lower(), string.ascii_lowercase),
    ]
    for f in filters:
        L = list(filter(f, L))
    return L

def list_to_file(L, fpath):
    with open(fpath, 'w') as f:
        for w in L:
            f.write('%s\n'%w)

def file_to_list(fpath):
    with open(fpath, 'r') as f:
        L = f.readlines()
    L = [w.strip() for w in L]
    return L
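
A quick sanity check of the tokenizer and the unconditional filters on a made-up sentence (a sketch added for illustration; not part of the original pipeline):

In [ ]:
# Illustrative only: words containing punctuation or digits, or shorter than
# 3 characters, should be dropped by uncond_filter_words.
sample = 'We train a 3-layer net, achieving 95% accuracy on ImageNet'
print(tokenize(sample))
print(uncond_filter_words(tokenize(sample)))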

In [ ]:
def stack_papers(years, dest_path):
    content = ''
    for i, y in enumerate(years):
        with open('../output%d/Papers.csv'%y, 'r') as pfile:
            if i>0:
                # skip header
                pfile.readline()
            content = content + pfile.read()
            
    # write
    with open(dest_path, 'w') as dfile:
        dfile.write(content)

def hist_words(list_words):
    """Return a map: word->count from the list of words."""
    m = {}
    for w in list_words:
        m[w] = 1 if w not in m else m[w] + 1
    return m


class CacheStemmer(object):
    def __init__(self):
        self.stemmer = SnowballStemmer('english')
        self.cache = {}
        
    def stem(self, word):
        if word in self.cache:
            return self.cache[word]
        else:
            st = self.stemmer.stem(word)
            self.cache[word] = st
            return st
        
        
def stem_words(words, stemmer=CacheStemmer()):
    stemmed = []
    for w in words:
        try:
            stemmed.append(stemmer.stem(w))
        except UnicodeDecodeError:
            # skip words that cannot be decoded
            pass
    return stemmed
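
A small illustration of stem_words and hist_words on made-up tokens (plural/singular pairs should collapse to a single stem):

In [ ]:
# Illustrative only: stem a few tokens and count the resulting stems.
toks = ['kernels', 'kernel', 'networks', 'network', 'bayesian']
print(hist_words(stem_words(toks)))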

In [ ]:
stemmer = CacheStemmer()
stemmer.stem('recognize')

In [ ]:
fyear = 1988
tyear = 2015
stack_fname = 'Papers%d_%d.csv'%(fyear, tyear)
stack_papers(range(fyear, tyear+1), stack_fname)

Vocabulary

Build the list of all stemmed words appearing in the papers (titles, abstracts, and full text)


In [ ]:
stemmer = CacheStemmer()
set_words = set()
paper_folders = ['../output%d'%y for y in range(fyear, tyear+1)]

# dictionary compiled from the SCOWL (Spell Checker Oriented Word Lists) dictionary
#dict_all = file_to_list('dict_all.txt')
# dictionary from WordNet (noun index)
dict_all = file_to_list('index.noun')
#dict_all = file_to_list('index.verb')

set_stop = set(file_to_list('stop_words.txt'))
set_dict = set(dict_all)
for oi, out_fol in enumerate(paper_folders):
    fpath = os.path.join(out_fol, 'Papers.csv')
    data = paper_dataframe(fpath)

    for i in range(data.shape[0]):
        abstract = data['Abstract'][i]
        content = data['PaperText'][i]
        title = data['Title'][i]

        title_words = tokenize(title)
        abs_words = tokenize(abstract)
        content_words = tokenize(content)
        words = title_words + abs_words + content_words
        # only include words (before stemming) that are in the dictionary
        # and not in the list of stop words
        words = [w for w in words if w in set_dict and w not in set_stop]
        stwords = stem_words(words, stemmer)
        set_words.update(stwords)

In [ ]:
uncond_words = uncond_filter_words(set_words)
uncond_words.sort()
list_to_file(uncond_words, 'uncond_filtered_words.txt')
print('unconditionally filtered words: %d'%(len(uncond_words)))

Document-Term matrix


In [ ]:
w2ind = {w: i for i, w in enumerate(uncond_words)}
data = paper_dataframe(stack_fname)

In [ ]:
doc_ind = []
word_ind = []
counts = []
titles = []
for i in range(data.shape[0]):
    abstract = data['Abstract'][i]
    content = data['PaperText'][i]
    title = data['Title'][i]
    titles.append(title)

    title_words = tokenize(title)
    abs_words = tokenize(abstract)
    content_words = tokenize(content)
    di_words = title_words + abs_words + content_words
    stwords = stem_words(di_words, stemmer)
    hist = hist_words(stwords)
    for w, c in hist.items():
        if w in w2ind:
            # ignore terms which are not in the vocabulary
            wi = w2ind[w]
            doc_ind.append(i)
            word_ind.append(wi)
            counts.append(c)

In [ ]:
# save DT to a file 
DT = sp.csr_matrix( (counts, (doc_ind, word_ind)), shape=(data.shape[0], len(uncond_words)) )
dt_fpath = 'DT_%d_%d.p'%(fyear, tyear)
info = {'DT': DT, 'words': uncond_words, 'titles': titles}
with open(dt_fpath, 'wb') as f:
    pickle.dump(info, f)

In [ ]:
DT.shape

Filter DTM


In [ ]:
with open(dt_fpath, 'rb') as f:
    info = pickle.load(f)
    
words = info['words']
DT = info['DT']
titles = info['titles']

In [ ]:
# document frequency of each word
n = DT.shape[0]
DF = np.array( (DT > 0).sum(0) )[0]
df_lb = 5
df_ub = int(0.4*n)

print('#docs: %d'%n)
print('original #words: %d'%len(words))
print('#words with %d <= df: %d'% (df_lb, np.sum(DF>=df_lb) ) )
print('#words with df <= %d: %d'% (df_ub, np.sum(DF<=df_ub) ) )
df_I = np.logical_and(DF>=df_lb, DF<=df_ub)
print('#words with %d <= df <= %d: %d'% 
      (df_lb, df_ub, np.sum( df_I) ) )

In [ ]:
plt.plot(sorted(DF))
plt.xlabel('word index')
plt.ylabel('doc frequency')

In [ ]:
df_words = np.array(words)[df_I].tolist()
list_to_file(df_words, 'words_df%d_%d.txt'%(df_lb, df_ub))

Categorize documents


In [ ]:
def has_some_substring(text, substrings):
    """True if the text contains at least one of the substrings (case-insensitive)."""
    text_lower = text.lower()
    return any(text_lower.find(s.lower()) > -1 for s in substrings)

In [ ]:
stitles = sorted(titles)

supervised_kws = ['large margin', 'classif', 'regression', 'kernel', 'ensemble', 
                  'neural net']
deep_kws = ['deep', 'drop out', 'auto-encod', 'convolutional', 'neural net', 'belief net', 
           'boltzmann']
neuro_kws = ['motor control', 'neural', 'neuron', 'spiking', 'spike', 'cortex', 'plasticity', 
            'neural decod', 'neural encod', 'brain imag', 'biolog', 'perception', 'cognitive', 
            'emotion', 'synap', 'neural population', 'cortical', 'firing rate', 'firing-rate', 
            'sensor']
bayesian_kws = ['graphical model', 'bayesian', 'inference', 'mcmc', 'monte carlo', 
               'posterior', 'prior', 'variational', 'markov', 'latent', 'probabilistic', 
               'exponential family']
kernel_kws = ['kernel', 'distribution embedding', 'support vector', 'gaussian process']
learning_kws = ['learning theory', 'consistency', 'theoretical guarantee', 
                'complexity', 'pac-bayes', 'pac-learning', 'generalization', 
                'uniform converg', 'bound', 'deviation', 'inequality', 'risk min', 'minimax', 
               'structural risk', 'VC', 'rademacher', 'asymptotic']
rl_kws = ['reinforce', 'regret', 'apprenticeship', 'game', 'TD', 'mdp', 'markov decision', 
         'agent', 'reward', 'player', 'thompson', 'policy', 'policies', 'value function', 
          'Q learning', 'Q-learning', 'planning', 'bandit', 'value iteration']

neuro_titles = [t for t in stitles if has_some_substring(t, neuro_kws)]
bayesian_titles = [t for t in stitles if has_some_substring(t, bayesian_kws)]
deep_titles = [t for t in stitles if has_some_substring(t, deep_kws)]
supervised_titles = [t for t in stitles if has_some_substring(t, supervised_kws)]
kernel_titles = [t for t in stitles if has_some_substring(t, kernel_kws)]
learning_titles = [t for t in stitles if has_some_substring(t, learning_kws)]
rl_titles = [t for t in stitles if has_some_substring(t, rl_kws)]

In [ ]:
print(', '.join(deep_kws))

In [ ]:
print(len(learning_titles))
learning_titles

In [ ]:
title_kws1 = bayesian_kws
fname1 = 'bayes'
title_kws2 = bayesian_kws
fname2 = 'bayes'

#title_kws1 = neuro_kws
#fname1 = 'neuro'

#title_kws2 = neuro_kws
#fname2 = 'neuro'

#title_kws1 = deep_kws
#fname1 = 'deep'
#title_kws2 = deep_kws
#fname2 = 'deep'

#title_kws2 = learning_kws
#fname2 = 'learning'

"""
title_kws1 = bayesian_kws
fname1 = 'bayes'
"""
#title_kws2 = deep_kws
#fname2 = 'deep'

np.random.seed(2990)
set_ind1 = set([i for i in range(n) if has_some_substring(titles[i], title_kws1) ])
set_ind2 = set([i for i in range(n) if has_some_substring(titles[i], title_kws2) ])    
    
#split_mode = 'disjoint'
split_mode = 'random'

if split_mode == 'disjoint':
    # split the matched titles into two disjoint sets
    common_ind = set_ind1 & set_ind2
    # remove common documents in both
    doc_I1 = np.array(list(set_ind1.difference(common_ind)))
    doc_I2 = np.array(list(set_ind2.difference(common_ind)))
    #doc_logI1 = np.zeros(n, dtype=np.bool)
    #doc_logI1[doc_I1] = True
    #doc_logI2 = np.zeros(n, dtype=np.bool)
    #doc_logI2[doc_I2] = True
elif split_mode == 'random':
    # consider only fname1 and randomly split the samples into two disjoint halves.
    list_ind1 = np.array(list(set_ind1))
    half_ind = int(len(list_ind1)/2.001)  # slightly fewer than half
    split_ind = np.random.choice(len(list_ind1), half_ind, replace=False)
    doc_I1 = list_ind1[split_ind]
    doc_I2 = np.array(list(set_ind1.difference(set(doc_I1))))

else:
    raise ValueError('unknown split_mode')
Pdoc = DT[doc_I1, :]
Qdoc = DT[doc_I2, :]
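
Whichever split mode is chosen, the two document index sets are intended to be disjoint. A quick check (added as a sketch, not part of the original notebook):

In [ ]:
# Sanity check: the two document sets must not overlap.
assert len(set(doc_I1) & set(doc_I2)) == 0
print('P documents: %d, Q documents: %d' % (len(doc_I1), len(doc_I2)))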

In [ ]:
len(doc_I1)

In [ ]:
print('set 1. %d titles:'%len(doc_I1))
for i in doc_I1:
    print(titles[i])

In [ ]:
print('set 2. %d titles:'%len(doc_I2))
for i in doc_I2:
    print(titles[i])

In [ ]:
# remove all the words that do not occur in any doc
PQdoc = sp.vstack((Pdoc, Qdoc))
pq_occur_wordI = np.array(PQdoc.sum(0) >= 1)[0]
andI = np.logical_and(pq_occur_wordI, df_I)
#andI = df_I
pq_words = np.array(words)[andI]
P = Pdoc[:, andI]
Q = Qdoc[:, andI]

In [ ]:
print('words left: %d'%P.shape[1])

In [ ]:
import sklearn.cluster as clu
# k-means on the terms

n_clusters = 2000
clust = clu.KMeans(n_clusters=n_clusters, n_init=5, 
                   #init='random', 
                   random_state=12)
PQ = sp.vstack((P,Q))
PQ01 = PQ > 0

In [ ]:
IDF = (np.log(n) -  np.log(DF[andI]))
# scale each column by its IDF; keep the result in CSR so columns can be sliced later
P_tfidf = sp.csr_matrix(P.multiply(IDF))
Q_tfidf = sp.csr_matrix(Q.multiply(IDF))
tfidf = sp.vstack((P_tfidf, Q_tfidf))
#row_norms = np.array(PQ.power(2).sum(1))**0.5
#PQ_norm = PQ.multiply(1.0/row_norms)
PQ_norm = tfidf
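
The weighting above is plain TF-IDF: each raw term count is multiplied by log(n/df(w)), where the document frequency df(w) is computed over all n documents, so words that appear in most papers are down-weighted. A toy check on made-up counts (a sketch, not part of the pipeline):

In [ ]:
# Illustrative only: a word occurring in every document gets IDF = 0,
# so its TF-IDF column becomes all zeros.
toy_counts = sp.csr_matrix(np.array([[2, 1], [3, 0]]))  # 2 documents, 2 words
toy_df = np.array((toy_counts > 0).sum(0))[0]           # document frequencies: [2, 1]
toy_idf = np.log(toy_counts.shape[0]) - np.log(toy_df)  # [0, log(2)]
print(toy_counts.multiply(toy_idf).toarray())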

In [ ]:
mode = 'random'
if mode=='random':
    # Kacper: pick a random subset of the words
    words_ind = np.random.choice(P_tfidf.shape[1], n_clusters, replace=False)
elif mode=='kmeans':
    clust.fit(PQ_norm.T)
    mode_cluster = stats.mode(clust.labels_)[0][0]
    mode_words = pq_words[clust.labels_==mode_cluster]
    #for w in mode_words:
    #    print w,
    plt.plot(sorted(clust.labels_), 'o')
    plt.xlabel('word index')
    plt.ylabel('cluster index')
    # first word in each cluster
    words_ind = []
    for ci in range(n_clusters):
        ind = np.where(clust.labels_==ci)[0]
        words_ind.append(ind[0])

Save the data


In [ ]:
final_words = pq_words[words_ind]
P_clus = P_tfidf[:, words_ind]
Q_clus = Q_tfidf[:, words_ind]

sortI = np.argsort(final_words)
swords = final_words[sortI]
P_sort = P_clus[:, sortI]
Q_sort = Q_clus[:, sortI]

# Construct the final PQ
np_titles = np.array(titles)
# densify the sparse TF-IDF submatrices
P_arr = np.asarray(P_sort.todense())
Q_arr = np.asarray(Q_sort.todense())
data = {'P': P_arr, 'Q': Q_arr, 'words': swords, 
        'P_titles': np_titles[doc_I1], 'Q_titles': np_titles[doc_I2], 'P_theme': fname1, 
       'Q_theme': fname2}

fname = '%s_%s_np%d_nq%d_d%d.p'%(fname1, fname2, P_sort.shape[0], Q_sort.shape[0], n_clusters)
with open(fname, 'wb') as dest:
    pickle.dump(data, dest)
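
To load the saved problem back later (a minimal sketch; the keys are those of the dictionary constructed above):

In [ ]:
# Reload the pickled P/Q problem written above.
with open(fname, 'rb') as src:
    problem = pickle.load(src)
print(problem['P'].shape, problem['Q'].shape, len(problem['words']))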

In [ ]:
# final word list
for i in range(0, len(swords), 10):
    print(' '.join(swords[i:i+10]))

In [ ]: