In [1]:
%pylab inline
import pandas as pd


Populating the interactive namespace from numpy and matplotlib

In [2]:
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
themes = ['mnchn', 'adolescent', 'geriatrics', 'specpop']
# Load the curated keyword sheet for each theme; forward-fill the merged AO cells.
keywords = {
    theme: pd.read_excel('data/cycle_1/'+theme+'.xlsx', usecols=['AO', 'Final Keywords'],
                         dtype={'Final Keywords': str}).fillna(method='ffill') for theme in themes
}
# Concatenate each AO's keywords into a single space-separated string, stripping punctuation.
keywords_final = {}
for theme in themes:
    keywords[theme]['Final Keywords'] += ','
    keywords_final[theme] = keywords[theme].groupby('AO').sum()['Final Keywords'].str.replace(
        '"', '').str.replace('“', '').str.replace('&', ' ').str.replace(',', ' ').str.replace('/', '')

In [5]:
import sqlalchemy
from sqlalchemy.dialects import postgresql as psql
from sqlalchemy import Column, Integer, String, DATE
from sqlalchemy.ext.declarative import declarative_base

Base = declarative_base()

class Document(Base):
    """Documents table: title, body text, and metadata."""
    __tablename__ = 'document'

    id = Column(Integer, primary_key=True)

    title = Column(psql.TEXT)
    date = Column(DATE)
    doctype = Column(psql.TEXT)
    docnum = Column(psql.TEXT)
    subject = Column(psql.TEXT)
    body = Column(psql.TEXT)
    sign = Column(psql.TEXT)
    signtitle = Column(psql.TEXT)
    images = Column(psql.JSONB)
    raw_json = Column(psql.JSONB)

    def __repr__(self):
        return self.title

In [6]:
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

engine = create_engine('postgresql://dev:dev@localhost/dev')
Base.metadata.create_all(engine)
Session = sessionmaker(bind=engine)
session = Session()

In [7]:
query = 'SELECT title, body FROM Document'

In [8]:
df = pd.read_sql_query(query, engine)

In [9]:
df.body = df.body.str.replace('\n', ' ').str.replace('\t', ' ')

In [10]:
df_mnchn = df[df.title.isin(keywords_final['mnchn'].index.values)]
df_adolescent = df[df.title.isin(keywords_final['adolescent'].index.values)]
df_geriatric = df[df.title.isin(keywords_final['geriatrics'].index.values)]
df_specpop = df[df.title.isin(keywords_final['specpop'].index.values)]

In [11]:
# The merge attaches each AO's keyword string and supersedes the simple filters in the previous cell.
df_mnchn = df.merge(keywords_final['mnchn'].reset_index(), left_on='title', right_on='AO').drop('title', axis=1)
df_adolescent = df.merge(keywords_final['adolescent'].reset_index(), left_on='title', right_on='AO').drop('title', axis=1)
df_geriatric = df.merge(keywords_final['geriatrics'].reset_index(), left_on='title', right_on='AO').drop('title', axis=1)
df_specpop = df.merge(keywords_final['specpop'].reset_index(), left_on='title', right_on='AO').drop('title', axis=1)

In [11]:
# Sanity check: every AO listed in the keyword sheets should have matched a document (all sets empty).
print(set.difference(set(keywords_final['specpop'].index), set(df_specpop.AO)))
print(set.difference(set(keywords_final['geriatrics'].index), set(df_geriatric.AO)))
print(set.difference(set(keywords_final['adolescent'].index), set(df_adolescent.AO)))
print(set.difference(set(keywords_final['mnchn'].index), set(df_mnchn.AO)))


set()
set()
set()
set()

Labels:

  • MNCHN = 1
  • Adolescent = 2
  • Geriatrics = 3
  • Special Populations = 4
  • Other = 5, but disregarded for the 1st cycle (a label-name mapping sketch follows this list)
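
As a reading aid (not part of the original notebook), the label scheme can be captured as a dict whose names mirror the themes list above; label 5 is included only for completeness.

# Hypothetical mapping between the integer labels and theme names; 'other' (5) is unused in cycle 1.
label_names = {1: 'mnchn', 2: 'adolescent', 3: 'geriatrics', 4: 'specpop', 5: 'other'}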

In [12]:
df_train = pd.DataFrame()
# Training text per theme: document bodies plus the curated keyword strings.
# MNCHN
df_train['body'] = df_mnchn['body'].append(df_mnchn['Final Keywords'])
df_train['label'] = 1
# Adolescent
df_train = df_train.append(pd.DataFrame({
    'body': df_adolescent['body'].append(df_adolescent['Final Keywords']),
    'label': 2
}))
# Geriatrics
df_train = df_train.append(pd.DataFrame({
    'body': df_geriatric['body'].append(df_geriatric['Final Keywords']),
    'label': 3
}))
# Special Populations
df_train = df_train.append(pd.DataFrame({
    'body': df_specpop['body'].append(df_specpop['Final Keywords']),
    'label': 4
}))
df_train.reset_index(drop=True, inplace=True)
# Other: disregarded for now. The intent was to label everything unmatched by any theme as 5, e.g.:
# other_titles = set(df.title) - (set(df_mnchn.AO) | set(df_adolescent.AO)
#                                 | set(df_geriatric.AO) | set(df_specpop.AO))
# df_train = df_train.append(pd.DataFrame({
#     'body': df[df.title.isin(other_titles)]['body'],
#     'label': 5
# }))

Classification Pipeline

TF-IDF features over word n-grams (n = 1 to 4) feed a multinomial Naive Bayes classifier; its predicted class probabilities are then used to rank every document against each theme.

In [13]:
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# NLTK stopword lists must be downloaded beforehand (nltk.download('stopwords')).
tfid_params = {
    'stop_words': stopwords.words(),  # stop words for every language NLTK ships
    'ngram_range': (1, 4),
    'strip_accents': 'ascii',
}

text_clf = Pipeline([('vect_tfid', TfidfVectorizer(**tfid_params)),
                     ('clf', MultinomialNB()),
                     ])

In [14]:
model_cycle_1 = text_clf.fit(df_train.body, df_train.label)

In [15]:
# predict_proba columns follow the sorted class labels (1-4), which matches the order of `themes`.
results = pd.DataFrame(model_cycle_1.predict_proba(df.body), columns=themes)
results['AO'] = df.title

In [16]:
for theme in themes:
    # Keep the 40 documents ranked highest for this theme.
    results.sort_values(by=theme, ascending=False)[:40][['AO', theme]].to_csv(
        theme + '_cycle1_results.csv', index=False)

In [14]:
df_mnchn.drop('body', axis=1).to_csv('mnchn_cycle1_keywords.csv', index=False)
df_geriatric.drop('body', axis=1).to_csv('geriatric_cycle1_keywords.csv', index=False)
df_adolescent.drop('body', axis=1).to_csv('adolescent_cycle1_keywords.csv', index=False)
df_specpop.drop('body', axis=1).to_csv('specpop_cycle1_keywords.csv', index=False)

In [ ]: