In [1]:
%pylab inline
import pandas as pd


Populating the interactive namespace from numpy and matplotlib

In [2]:
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
themes = ['mnchn', 'adolescent', 'geriatrics', 'specpop']


def _read_cycle_csv(theme, suffix, **read_kwargs):
    """Read ``data/cycle_3/<theme><suffix>`` and forward-fill empty cells.

    ``.ffill()`` replaces ``fillna(method='ffill')``, whose ``method``
    argument is deprecated in recent pandas releases.
    """
    return pd.read_csv('data/cycle_3/' + theme + suffix, **read_kwargs).ffill()


# One raw keyword table and one reviewer-flag table per theme.
keywords = {
    theme: _read_cycle_csv(theme, '_keywords.csv', dtype={'Keywords': str})
    for theme in themes
}

flags = {
    theme: _read_cycle_csv(theme, '_flagged.csv') for theme in themes
}

# Characters scrubbed from the joined keyword strings, with their replacements.
# The right curly quote is included so both halves of a “quoted” phrase are
# removed (previously only the left one was).
_KEYWORD_REPLACEMENTS = (
    ('"', ''),
    ('“', ''),
    ('”', ''),
    ('&', ' '),
    (',', ' '),
    ('/', ''),
)

# keywords_final[theme] is a Series indexed by AO holding one cleaned,
# space-separated keyword string per AO.
keywords_final = {}
for theme in themes:
    keywords[theme].columns = keywords[theme].columns.str.strip()
    # The trailing comma keeps adjacent rows apart once they are concatenated.
    keywords[theme]['cycle_2_keywords'] += ','
    # Select the column before aggregating so unrelated columns are never summed.
    joined_keywords = keywords[theme].groupby('AO')['cycle_2_keywords'].sum()
    for char, replacement in _KEYWORD_REPLACEMENTS:
        # regex=False: these are literal characters, independent of the
        # pandas-version-specific default for `regex`.
        joined_keywords = joined_keywords.str.replace(char, replacement, regex=False)
    keywords_final[theme] = joined_keywords

In [4]:
import sqlalchemy
from sqlalchemy.dialects import postgresql as psql
from sqlalchemy import Column, Integer, String, DATE
from sqlalchemy.ext.declarative import declarative_base

# No `import datetime` is visible elsewhere in this notebook (presumably it
# leaked in through the %pylab namespace), so import it explicitly here.
import datetime

Base = declarative_base()


class Document(Base):
    """ORM mapping for the ``document`` table."""
    __tablename__ = 'document'

    id = Column(Integer, primary_key=True)
    title = Column(psql.TEXT)
    date = Column(DATE)
    # Both timestamps are filled by a Python-side default when a row is inserted.
    # NOTE(review): `modified` has no `onupdate=`, so it is not refreshed on
    # UPDATE -- confirm that is intended.
    created = Column(DATE, default=datetime.datetime.now)
    modified = Column(DATE, default=datetime.datetime.now)
    doctype = Column(psql.TEXT)
    docnum = Column(psql.TEXT)
    subject = Column(psql.TEXT)
    body = Column(psql.TEXT)
    sign = Column(psql.TEXT)
    signtitle = Column(psql.TEXT)
    images = Column(psql.JSONB)
    raw_json = Column(psql.JSONB)

    def __repr__(self):
        """Return the title, or an id-based placeholder when there is no title.

        ``__repr__`` must return a ``str``; returning a ``None`` title would
        make ``repr(document)`` raise ``TypeError``.
        """
        if self.title is None:
            return '<Document id={!r}>'.format(self.id)
        return self.title

In [5]:
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

import os

# Connection URL: taken from the DATABASE_URL environment variable when set,
# so real credentials never have to be written into the notebook. The fallback
# is the local development database this notebook has always used.
DATABASE_URL = os.environ.get('DATABASE_URL', 'postgresql://dev:dev@localhost/dev')

engine = create_engine(DATABASE_URL)
# Creates any table declared on Base that does not exist yet.
Base.metadata.create_all(engine)
Session = sessionmaker(bind=engine)
session = Session()

In [6]:
# Only the two columns used downstream: the title (join key) and the body text.
query = "SELECT title, body FROM Document"

In [7]:
# Load every (title, body) pair into a DataFrame.
df = pd.read_sql_query(sql=query, con=engine)

In [8]:
# Flatten newlines and tabs to spaces in a single pass. Missing bodies (the
# column is nullable) become empty strings so that every entry passed to the
# vectorizer later on is a real string rather than NaN.
df['body'] = df['body'].str.replace(r'[\n\t]', ' ', regex=True).fillna('')

In [9]:
def _join_flags_with_keywords(theme):
    """Outer-join one theme's reviewer flags with its per-AO keyword strings.

    The column named after the theme is dropped, and rows without a reviewer
    FLAG get FLAG = 1.
    """
    keyword_table = keywords_final[theme].reset_index()
    joined = pd.merge(flags[theme], keyword_table, on='AO', how='outer')
    joined = joined.drop(theme, axis=1)
    joined['FLAG'] = joined['FLAG'].fillna(1)
    return joined


df_mnchn_flag = _join_flags_with_keywords('mnchn')
df_adolescent_flag = _join_flags_with_keywords('adolescent')
df_geriatric_flag = _join_flags_with_keywords('geriatrics')
df_specpop_flag = _join_flags_with_keywords('specpop')

In [10]:
# Attach the document body to every flagged AO by matching AO against the
# document title (default inner join); the redundant title column is dropped.
df_mnchn, df_adolescent, df_geriatric, df_specpop = (
    flag_frame.merge(df, left_on='AO', right_on='title').drop('title', axis=1)
    for flag_frame in (
        df_mnchn_flag,
        df_adolescent_flag,
        df_geriatric_flag,
        df_specpop_flag,
    )
)

In [11]:
# Sanity check: AOs present in the flag tables that found no matching document
# title in the merge above. An empty set means nothing was lost.
for flagged, matched in (
    (df_specpop_flag, df_specpop),
    (df_geriatric_flag, df_geriatric),
    (df_adolescent_flag, df_adolescent),
    (df_mnchn_flag, df_mnchn),
):
    print(set(flagged.AO) - set(matched.AO))


set()
set()
set()
set()

Labels:

  • MNCHN = 1
  • Adolescent = 2
  • Geriatrics = 3
  • Special Populations = 4
  • Other = 5

Reviewer feedback is integrated via sample weights. Each reviewer FLAG value maps to a training weight (FLAG : weight):

  • 1 : 1
  • 0.5 : 0.5
  • 0 : 0.001

In [12]:
seed = 427  # NOTE(review): not referenced anywhere in the visible notebook


def _build_theme_training(theme_frame, label):
    """Build the training rows for one theme.

    Every row of ``theme_frame`` contributes two training texts that share its
    FLAG: the document body and the keyword string. The FLAG becomes the
    sample weight, with 0 mapped to 0.001. Rows with missing values, blank
    text, or a repeated text are discarded.

    ``pd.concat`` is used because ``Series.append`` no longer exists in
    pandas 2.x.
    """
    texts = pd.concat([theme_frame['body'], theme_frame['cycle_2_keywords']])
    doubled_flags = pd.concat([theme_frame['FLAG'], theme_frame['FLAG']])

    # Both concatenations repeat the same index twice, so the columns are
    # assigned positionally rather than re-aligned on that duplicated index.
    training = pd.DataFrame(index=texts.index)
    training['body'] = texts.values
    training['label'] = label
    training['weights'] = doubled_flags.replace(0, 0.001).values

    training = training.dropna()
    training = training[(training.body != '') & (training.body != ' ')]
    return training.drop_duplicates('body')


# MNCHN
df_train_mnchn = _build_theme_training(df_mnchn, 1)

# Adolescent
df_train_adolescent = _build_theme_training(df_adolescent, 2)

# Geriatrics
df_train_geriatrics = _build_theme_training(df_geriatric, 3)

# Spec Pop
df_train_specpop = _build_theme_training(df_specpop, 4)

# Other: documents a reviewer flagged 0 for at least one theme.
rejected_titles = set.union(*(
    set(theme_frame[theme_frame.FLAG == 0].AO)
    for theme_frame in (df_mnchn, df_adolescent, df_geriatric, df_specpop)
))
df_train_other = pd.DataFrame()
df_train_other['body'] = df[df.title.isin(rejected_titles)]['body']
# Same column order as the theme frames so the concat below needs no re-ordering.
df_train_other['label'] = 5
df_train_other['weights'] = 0.999

df_train = pd.concat([
    df_train_mnchn,
    df_train_adolescent,
    df_train_geriatrics,
    df_train_specpop,
    df_train_other,
])

Classification Pipeline


In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from string import punctuation
from nltk.corpus import stopwords
from nltk import word_tokenize
 
stop_words = stopwords.words('english') + list(punctuation)
# Frozen copy used by tokenize(): membership tests on a set are constant-time,
# whereas the list above is scanned linearly for every token.
_stop_word_set = frozenset(stop_words)


def tokenize(text):
    """Split ``text`` into lower-cased tokens for the TF-IDF vectorizer.

    Tokens that are English stop words, single punctuation characters, or made
    up entirely of digits are dropped.
    """
    lowered = (token.lower() for token in word_tokenize(text))
    return [
        token for token in lowered
        if token not in _stop_word_set and not token.isdigit()
    ]


tfid_params = {
    'ngram_range': (1, 4),
    'strip_accents': 'ascii',
    'tokenizer': tokenize,
}

text_clf = Pipeline([
    ('vectorizer', TfidfVectorizer(**tfid_params)),
    ('clf', MultinomialNB()),
])

In [14]:
# Fit the pipeline on all training texts; the `clf__` prefix routes the
# per-row weights to the 'clf' step.
model_cycle_3 = text_clf.fit(
    df_train.body,
    df_train.label.values,
    clf__sample_weight=df_train.weights,
)

In [15]:
# Score every document: one probability column per theme plus 'other'.
# NOTE(review): the column names assume predict_proba's columns follow label
# order 1..5 -- confirm against model_cycle_3.classes_.
probability_columns = themes + ['other']
class_probabilities = model_cycle_3.predict_proba(df.body)
results = pd.DataFrame(class_probabilities, columns=probability_columns)
results['AO'] = df.title

In [16]:
# For each theme, write its 100 highest-scoring documents (AO and score) to
# '<theme>_cycle3_results.csv' in the working directory.
for theme in themes:
    r = results[['AO', theme]].sort_values(by=theme, ascending=False).head(100)
    r.to_csv(theme + '_cycle3_results.csv', index=False)

In [17]:
# mnchn = text_clf.fit(df_train_mnchn.body, df_train_mnchn.label.values, 
#                      **{'clf__sample_weight': df_train_mnchn.weights})
# specpop = text_clf.fit(df_train_specpop.body, df_train_specpop.label, 
#                        **{'clf__sample_weight': df_train_specpop.weights})
# adolescent = text_clf.fit(df_train_adolescent.body, df_train_adolescent.label, 
#                           **{'clf__sample_weight': df_train_adolescent.weights})
# geriatrics = text_clf.fit(df_train_geriatrics.body, df_train_geriatrics.label, 
#                           **{'clf__sample_weight': df_train_geriatrics.weights})

In [18]:
# df_mnchn_res = pd.DataFrame(mnchn.predict_proba(df.body), columns=['irrelevance', 'relevance'])
# df_mnchn_res['AO'] = df.title
# df_specpop_res = pd.DataFrame(specpop.predict_proba(df.body), columns=['irrelevance', 'relevance'])
# df_specpop_res['AO'] = df.title
# df_adolescent_res = pd.DataFrame(adolescent.predict_proba(df.body), columns=['irrelevance', 'relevance'])
# df_adolescent_res['AO'] = df.title
# df_geriatric_res = pd.DataFrame(geriatrics.predict_proba(df.body), columns=['irrelevance', 'relevance'])
# df_geriatric_res['AO'] = df.title

In [19]:
# a = df_mnchn_res.sort_values('irrelevance', ascending=False)[:30]
# # set.intersection(set(a.AO),set(df_mnchn_flag.AO))
# # set.difference( set(df_mnchn[df_mnchn.FLAG != 0].AO), set(a.AO))
# a

In [ ]: