Process document-term matrix



In [ ]:

    
%load_ext autoreload
%autoreload 2
%matplotlib inline
#%config InlineBackend.figure_format = 'svg'
#config InlineBackend.figure_format = 'pdf'
from IPython.core.display import HTML
import matplotlib.pyplot as plt
from nltk.tokenize import WhitespaceTokenizer
import numpy as np
import os
import pandas as pd
# Prefer the C-accelerated cPickle module when it exists (Python 2) and fall
# back to the standard pickle module otherwise.  Only a failed import is
# handled, so unrelated errors (e.g. KeyboardInterrupt) are not swallowed.
try:
    import cPickle as pickle
except ImportError:
    import pickle

import re
import scipy.stats as stats
import scipy.sparse as sp
import string
import sys
import nltk
import csv



In [ ]:

    
# fyear/tyear select which pickled file to read; presumably they are the first
# and last year covered by the data -- TODO confirm against the script that
# wrote the file.
fyear = 1988
tyear = 2015
dt_fpath = 'DT_%d_%d.p'%(fyear, tyear)
# Pickle streams are bytes, so the file has to be opened in binary mode ('rb').
# Text mode makes pickle.load fail on Python 3 and can corrupt the stream on
# Windows under Python 2.
# NOTE(review): pickle.load can execute arbitrary code; only load files that
# come from a trusted source.
with open(dt_fpath, 'rb') as f:
    info = pickle.load(f)



In [ ]:

    
# Pull the three entries used below out of the loaded mapping in one step.
words, DT, titles = info['words'], info['DT'], info['titles']



In [ ]:

    
# Display the keys available in the loaded mapping (cell output only).
info.keys()

Filter the document-term matrix (DTM)



In [ ]:

    
# Document frequency (DF) of each word: the number of documents (rows of DT)
# in which the word has a non-zero count.
n = DT.shape[0]
# Summing the boolean matrix over axis 0 counts the non-zero entries of every
# column.  Depending on the type of DT the result is a 1 x V np.matrix (SciPy
# sparse matrix) or already a 1-D array (dense ndarray, SciPy sparse array).
# ravel() yields a flat length-V vector in all of these cases, whereas
# indexing with [0] would return a single scalar for a 1-D result.
DF = np.asarray((DT > 0).sum(axis=0)).ravel()
# Thresholds: keep words occurring in at least df_lb documents and in at most
# 15% of all documents.
df_lb = 5
df_ub = int(0.15*n)

print('n = #docs: %d'%n)
print('original #words: %d'%len(words))
print('#words with %d <= df: %d'% (df_lb, np.sum(DF>=df_lb) ) )
print('#words with df <= %d: %d'% (df_ub, np.sum(DF<=df_ub) ) )
# Boolean mask over the vocabulary: True where the word passes both bounds.
df_I = np.logical_and(DF>=df_lb, DF<=df_ub)
print('#words with %d <= df <= %d: %d'% 
      (df_lb, df_ub, np.sum( df_I) ) )



In [ ]:

    
# Select the words whose document frequency passed the filter (boolean mask
# df_I) and show them as a plain Python list.
df_words = np.asarray(words)[df_I]
df_words.tolist()



In [ ]:



In [ ]:



In [ ]: