In [1]:
import os
import re
import operator
import matplotlib.pyplot as plt
import warnings
import numpy as np
warnings.filterwarnings('ignore')  # silence library deprecation warnings for this session
import spacy
import scattertext

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

%matplotlib inline

In [4]:
from gensim.utils import lemmatize  # note: gensim's lemmatize is backed by the pattern library
from nltk.corpus import stopwords

In [5]:
themes = ['adolescent', 'geriatric', 'mnchn', 'specpop']
theme_dict = {}
for theme in themes:
    # each <theme>_aos.txt holds a single comma-separated line of document titles;
    # strip() drops the trailing newline that would otherwise cling to the last title
    with open('data/theme_list/' + theme + '_aos.txt', 'r') as f:
        theme_dict[theme] = tuple(f.readline().strip().split(','))
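
A quick sanity check that every theme file parsed into a non-empty tuple of titles:

In [ ]:
for theme, titles in theme_dict.items():
    print(theme, len(titles))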

In [6]:
import pandas as pd
import sqlalchemy
from sqlalchemy.dialects import postgresql as psql
from sqlalchemy import Column, Integer, String, DATE
from sqlalchemy.ext.declarative import declarative_base

Base = declarative_base()

class Document(Base):                  
    __tablename__ = 'api_document'         

    id = Column(Integer, primary_key=True)

    title = Column(psql.TEXT)          
    date = Column(DATE)                
    doctype = Column(psql.TEXT)        
    docnum = Column(psql.TEXT)         
    subject = Column(psql.TEXT)        
    body = Column(psql.TEXT)           
    sign = Column(psql.TEXT)           
    signtitle = Column(psql.TEXT)      
    images = Column(psql.JSONB)        
    raw_body = Column(psql.JSONB)      

    def __repr__(self):                
        return self.title              

from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

engine = create_engine('postgresql://dev:dev@localhost/dev')
Base.metadata.create_all(engine)
Session = sessionmaker(bind=engine)
session = Session()
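
The queries below go straight through the engine, but a quick ORM count confirms that both the connection and the Document mapping work:

In [ ]:
session.query(Document).count()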

In [7]:
theme_df = pd.DataFrame()

In [8]:
for theme in themes:
    # NB: str(tuple) renders a valid IN clause only when a theme has two or more
    # titles; see the parameterized sketch below for a safer variant
    query = 'SELECT body FROM api_document WHERE title IN ' + str(theme_dict[theme])

    df = pd.read_sql_query(query, engine)

    # flatten whitespace so downstream tokenization sees clean text
    df.body = df.body.str.replace('\n', ' ').str.replace('\t', ' ')

    df['label'] = theme

    theme_df = theme_df.append(df)
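
Interpolating str(theme_dict[theme]) into the SQL breaks for a theme with a single title (the tuple's trailing comma) and for titles containing quotes. A parameterized alternative, sketched here assuming SQLAlchemy 1.2+ for expanding bind parameters:

In [ ]:
from sqlalchemy import bindparam, text

stmt = text('SELECT body FROM api_document WHERE title IN :titles')
stmt = stmt.bindparams(bindparam('titles', expanding=True))
df = pd.read_sql_query(stmt, engine, params={'titles': list(theme_dict[theme])})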

LDA Training


In [9]:
stops = set(stopwords.words('english'))  # nltk stopwords list

In [10]:
def process_texts(texts):
    """
    Function to process texts. Following are the steps we take:
    
    1. Stopword removal.
    2. Collocation detection (disabled in this run).
    3. Lemmatization (rather than stemming, which can hurt interpretability).
    
    Parameters:
    ----------
    texts: Tokenized texts.
    
    Returns:
    -------
    texts: Pre-processed tokenized texts.
    """
    texts = [[word for word in line if word not in stops] for line in texts]
    # texts = [bigram[line] for line in texts]
    texts = [[word.split('/')[0] for word in lemmatize(' '.join(line), 
                    allowed_tags=re.compile('(NN)'), min_length=3)] for line in texts]
    return texts
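
gensim's lemmatize delegates to the pattern library, which is unmaintained and fragile on Python 3 (its import in fact fails below). A noun-only fallback built on NLTK instead, sketched under the assumption that the wordnet and averaged_perceptron_tagger data are available:

In [ ]:
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer

wnl = WordNetLemmatizer()

def process_texts_nltk(texts):
    """Same contract as process_texts, but pure NLTK: keep noun lemmas of length >= 3."""
    processed = []
    for line in texts:
        tokens = [word for word in line if word not in stops]
        processed.append([wnl.lemmatize(word, 'n')
                          for word, tag in pos_tag(tokens)
                          if tag.startswith('NN') and len(word) >= 3])
    return processed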

In [11]:
theme_df['final'] = ''

In [15]:
for i, r in theme_df.iterrows():
    tmp = process_texts([r['body'].split()])[0]  # tokenize first: process_texts expects token lists
    theme_df.at[i, 'final'] = tmp


---------------------------------------------------------------------------
BadZipFile                                Traceback (most recent call last)
<ipython-input-15-7d4c656a93cb> in <module>()
      1 for i, r in theme_df.iterrows():
----> 2     tmp = process_texts([r['body'].split()])[0]  # tokenize first: process_texts expects token lists
      3     theme_df.at[i, 'final'] = tmp

<ipython-input-10-ec7d6600afd5> in process_texts(texts)
     18     # texts = [bigram[line] for line in texts]
     19     texts = [[word.split('/')[0] for word in lemmatize(' '.join(line), 
---> 20                     allowed_tags=re.compile('(NN)'), min_length=3)] for line in texts]
     21     return texts

<ipython-input-10-ec7d6600afd5> in <listcomp>(.0)
     18     # texts = [bigram[line] for line in texts]
     19     texts = [[word.split('/')[0] for word in lemmatize(' '.join(line), 
---> 20                     allowed_tags=re.compile('(NN)'), min_length=3)] for line in texts]
     21     return texts

~/anaconda3/lib/python3.6/site-packages/gensim/utils.py in lemmatize(content, allowed_tags, light, stopwords, min_length, max_length)
   1061 
   1062     """
-> 1063     if not has_pattern():
   1064         raise ImportError("Pattern library is not installed. Pattern library is needed in order to use lemmatize function")
   1065     from pattern.en import parse

~/anaconda3/lib/python3.6/site-packages/gensim/utils.py in has_pattern()
   1034     """
   1035     try:
-> 1036         from pattern.en import parse  # noqa:F401
   1037         return True
   1038     except ImportError:

~/anaconda3/lib/python3.6/site-packages/pattern/text/en/__init__.py in <module>()
     77 )
     78 # Import all submodules.
---> 79 from pattern.text.en import inflect
     80 from pattern.text.en import wordnet
     81 from pattern.text.en import wordlist

~/anaconda3/lib/python3.6/site-packages/pattern/text/en/__init__.py in <module>()
     78 # Import all submodules.
     79 from pattern.text.en import inflect
---> 80 from pattern.text.en import wordnet
     81 from pattern.text.en import wordlist
     82 

~/anaconda3/lib/python3.6/site-packages/pattern/text/en/wordnet/__init__.py in <module>()
     55 for token in ("wordnet", "wordnet_ic", "sentiwordnet"):
     56     try:
---> 57         nltk.data.find("corpora/" + token)
     58     except LookupError:
     59         try:

~/anaconda3/lib/python3.6/site-packages/nltk/data.py in find(resource_name, paths)
    638                                      [pieces[i] + '.zip'] + pieces[i:])
    639             try:
--> 640                 return find(modified_name, paths)
    641             except LookupError:
    642                 pass

~/anaconda3/lib/python3.6/site-packages/nltk/data.py in find(resource_name, paths)
    624                 if os.path.exists(p):
    625                     try:
--> 626                         return ZipFilePathPointer(p, zipentry)
    627                     except IOError:
    628                         # resource not in zipfile

~/anaconda3/lib/python3.6/site-packages/nltk/compat.py in _decorator(*args, **kwargs)
    219     def _decorator(*args, **kwargs):
    220         args = (args[0], add_py3_data(args[1])) + args[2:]
--> 221         return init_func(*args, **kwargs)
    222     return wraps(init_func)(_decorator)
    223 

~/anaconda3/lib/python3.6/site-packages/nltk/data.py in __init__(self, zipfile, entry)
    471         """
    472         if isinstance(zipfile, string_types):
--> 473             zipfile = OpenOnDemandZipFile(os.path.abspath(zipfile))
    474 
    475         # Normalize the entry string, it should be relative:

~/anaconda3/lib/python3.6/site-packages/nltk/compat.py in _decorator(*args, **kwargs)
    219     def _decorator(*args, **kwargs):
    220         args = (args[0], add_py3_data(args[1])) + args[2:]
--> 221         return init_func(*args, **kwargs)
    222     return wraps(init_func)(_decorator)
    223 

~/anaconda3/lib/python3.6/site-packages/nltk/data.py in __init__(self, filename)
    990         if not isinstance(filename, string_types):
    991             raise TypeError('ReopenableZipFile filename must be a string')
--> 992         zipfile.ZipFile.__init__(self, filename)
    993         assert self.filename == filename
    994         self.close()

~/anaconda3/lib/python3.6/zipfile.py in __init__(self, file, mode, compression, allowZip64)
   1106         try:
   1107             if mode == 'r':
-> 1108                 self._RealGetContents()
   1109             elif mode in ('w', 'x'):
   1110                 # set the modified flag so central directory gets written

~/anaconda3/lib/python3.6/zipfile.py in _RealGetContents(self)
   1173             raise BadZipFile("File is not a zip file")
   1174         if not endrec:
-> 1175             raise BadZipFile("File is not a zip file")
   1176         if self.debug > 1:
   1177             print(endrec)

BadZipFile: File is not a zip file
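
The BadZipFile is not raised by gensim itself: on import, pattern's wordnet module probes NLTK's corpora, and one of the archives under nltk_data (wordnet, wordnet_ic, or sentiwordnet) is truncated or corrupted. A repair sketch, assuming the default ~/nltk_data location, is to delete those archives and re-download them:

In [ ]:
import os
import nltk

# remove the possibly corrupted corpus archives, then fetch fresh copies
for corpus in ('wordnet', 'wordnet_ic', 'sentiwordnet'):
    zf = os.path.expanduser('~/nltk_data/corpora/' + corpus + '.zip')
    if os.path.exists(zf):
        os.remove(zf)
    nltk.download(corpus)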

In [14]:
import itertools
combs = list(itertools.combinations(themes, 2))

In [16]:
combs


Out[16]:
[('adolescent', 'geriatric'),
 ('adolescent', 'mnchn'),
 ('adolescent', 'specpop'),
 ('geriatric', 'mnchn'),
 ('geriatric', 'specpop'),
 ('mnchn', 'specpop')]

In [17]:
import scattertext as st
nlp = spacy.en.English()  # spaCy 1.x API; on spaCy 2+ use spacy.load('en') instead

for c in combs:
    df = theme_df[theme_df.label.isin(c)]
    corpus = st.CorpusFromPandas(df, 
                                  category_col='label', 
                                  text_col='body',
                                  nlp=nlp).build()
    html = st.produce_scattertext_explorer(corpus,
              category=c[0],
              category_name=c[0],
              not_category_name=c[1],
              width_in_pixels=1000)
    with open(c[0] + '_' + c[1] + '.html', 'wb') as out:
        out.write(html.encode('utf-8'))
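
Each pass writes a standalone HTML file per theme pair (e.g. adolescent_geriatric.html) containing the interactive scattertext explorer; open it directly in a browser.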

In [ ]: