Process document-term matrix

%load_ext autoreload
%autoreload 2
%matplotlib inline
#%config InlineBackend.figure_format = 'svg'
#config InlineBackend.figure_format = 'pdf'
from IPython.core.display import HTML
import matplotlib.pyplot as plt
from nltk.tokenize import WhitespaceTokenizer
import numpy as np
import os
import pandas as pd
# Prefer the C-accelerated cPickle on Python 2; on Python 3 that module does
# not exist, so fall back to the standard pickle module.
try:
    import cPickle as pickle
except ImportError:
    import pickle

import re
import scipy.stats as stats
import scipy.sparse as sp
import string
import sys
import nltk
import csv

# First and last year; both are embedded in the name of the pickle file that
# is loaded below.
fyear = 1988
tyear = 2015
dt_fpath = 'DT_%d_%d.p'%(fyear, tyear)
# Pickle data is a byte stream, so the file is opened in binary mode: text
# mode makes pickle.load fail on Python 3 and can corrupt the data through
# newline translation on some platforms.
# NOTE(review): unpickling can execute arbitrary code -- only load files
# from a trusted source.
with open(dt_fpath, 'rb') as f:
    info = pickle.load(f)

# Unpack the three entries read from the loaded dictionary.
# presumably `words` is the vocabulary aligned with the columns of DT and
# `titles` holds one title per document -- TODO confirm against the producer.
words = info['words']
DT = info['DT']
titles = info['titles']

Filter the document-term matrix (DTM) by document frequency

# document frequency of each word
# The number of rows of DT is used as the number of documents.
n = DT.shape[0]
# For every column, count the rows holding a nonzero entry.  asarray + ravel
# flattens the result to 1-D whether sum(0) returns a 1 x m matrix (scipy
# sparse matrix) or an already 1-D array (sparse array / dense ndarray);
# indexing with [0] would only be correct in the first case.
DF = np.asarray( (DT > 0).sum(0) ).ravel()
# Keep words whose document frequency lies in [df_lb, df_ub]:
# at least 5 documents and at most 15% of all documents.
df_lb = 5
df_ub = int(0.15*n)

print('n = #docs: %d'%n)
print('original #words: %d'%len(words))
print('#words with %d <= df: %d'% (df_lb, np.sum(DF>=df_lb) ) )
print('#words with df <= %d: %d'% (df_ub, np.sum(DF<=df_ub) ) )
# Boolean mask over the words that satisfy both bounds.
df_I = np.logical_and(DF>=df_lb, DF<=df_ub)
print('#words with %d <= df <= %d: %d'% 
      (df_lb, df_ub, np.sum( df_I) ) )

# Words that survive the document-frequency filter.
df_words = np.array(words)[df_I]

