In [1]:
%pylab inline
import pandas as pd
In [2]:
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
In [3]:
# Theme identifiers. Each one names an Excel workbook under data/cycle_1/ and,
# later in the notebook, a probability column in the results table.
themes = ['mnchn', 'adolescent', 'geriatrics', 'specpop']

# Load the 'AO' and 'Final Keywords' columns of every theme workbook and
# forward-fill blanks (presumably the sheets leave repeated cells empty --
# confirm against the workbooks).
# `.ffill()` replaces `fillna(method='ffill')`, which is deprecated in current
# pandas releases.
keywords = {
    theme: pd.read_excel(
        'data/cycle_1/' + theme + '.xlsx',
        usecols=['AO', 'Final Keywords'],
        # NOTE(review): 'Keywords' is not one of the loaded columns, so this
        # dtype hint looks like a no-op. Left untouched because forcing
        # 'Final Keywords' to str may change how empty cells are read.
        dtype={'Keywords': str},
    ).ffill()
    for theme in themes
}

# For every theme, build one cleaned keyword string per AO.
keywords_final = {}
for theme in themes:
    # A trailing comma keeps keywords separated once the rows of one AO are
    # concatenated by the group-wise sum below.
    keywords[theme]['Final Keywords'] += ','
    joined_keywords = keywords[theme].groupby('AO')['Final Keywords'].sum()
    # NOTE(review): only the opening curly quote is stripped; a closing curly
    # quote would survive -- confirm whether that is intended.
    keywords_final[theme] = (
        joined_keywords
        .str.replace('"', '')
        .str.replace('“', '')
        .str.replace('&', ' ')
        .str.replace(',', ' ')
        .str.replace('/', '')
    )
In [5]:
import sqlalchemy
from sqlalchemy.dialects import postgresql as psql
from sqlalchemy import Column, Integer, String, DATE
from sqlalchemy.ext.declarative import declarative_base
# Declarative base class that the ORM models below derive from.
Base = declarative_base()


class Document(Base):
    """ORM mapping for the ``document`` table."""

    __tablename__ = 'document'

    # Integer primary key.
    id = Column(Integer, primary_key=True)

    # Free-text columns (PostgreSQL TEXT) and the document date.
    title = Column(psql.TEXT)
    date = Column(DATE)
    doctype = Column(psql.TEXT)
    docnum = Column(psql.TEXT)
    subject = Column(psql.TEXT)
    body = Column(psql.TEXT)
    sign = Column(psql.TEXT)
    signtitle = Column(psql.TEXT)

    # JSONB payload columns.
    images = Column(psql.JSONB)
    raw_json = Column(psql.JSONB)

    def __repr__(self):
        """Return the title, or an id-based placeholder when no title is set.

        ``__repr__`` must return a ``str``. ``title`` is a nullable column, so
        returning it directly raises ``TypeError`` for rows or new instances
        whose title is ``None``.
        """
        if self.title is not None:
            return self.title
        return '<Document id={!r}>'.format(self.id)
In [6]:
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
import os

# The connection URL (including credentials) can be supplied through the
# DATABASE_URL environment variable so that it does not have to live in the
# notebook. The original local development URL remains the default.
engine = create_engine(
    os.environ.get('DATABASE_URL', 'postgresql://dev:dev@localhost/dev')
)

# Create any tables declared on Base that do not exist yet.
Base.metadata.create_all(engine)

# Session factory bound to the engine, plus one session instance.
Session = sessionmaker(bind=engine)
session = Session()
In [7]:
# SQL text that selects the title and body of every row in the Document table.
# NOTE(review): the unquoted name `Document` is presumably resolved
# case-insensitively to the 'document' table declared by the ORM model -- confirm.
query = 'SELECT title, body FROM Document'
In [8]:
# Run the SELECT through the engine and load the result into a DataFrame.
df = pd.read_sql_query(sql=query, con=engine)
In [9]:
# Turn newlines and tabs in each document body into single spaces.
# NOTE(review): a missing body stays missing after .str.replace -- confirm the
# table has no NULL bodies before these texts reach the vectoriser.
df['body'] = (
    df['body']
    .str.replace('\n', ' ')
    .str.replace('\t', ' ')
)
In [10]:
# Keep only the documents whose title appears among a theme's AO values.
# These four names are assigned again by the merge step in the next cell.
df_mnchn = df.loc[df['title'].isin(keywords_final['mnchn'].index.values)]
df_adolescent = df.loc[df['title'].isin(keywords_final['adolescent'].index.values)]
df_geriatric = df.loc[df['title'].isin(keywords_final['geriatrics'].index.values)]
df_specpop = df.loc[df['title'].isin(keywords_final['specpop'].index.values)]
In [11]:
def _merge_theme_keywords(theme_name):
    """Join the documents to one theme's keywords on title == AO.

    Only documents whose title matches an AO of the theme are kept. The
    redundant 'title' column is dropped because 'AO' carries the same value.
    """
    theme_keywords = keywords_final[theme_name].reset_index()
    merged = df.merge(theme_keywords, left_on='title', right_on='AO')
    return merged.drop('title', axis=1)


df_mnchn = _merge_theme_keywords('mnchn')
df_adolescent = _merge_theme_keywords('adolescent')
df_geriatric = _merge_theme_keywords('geriatrics')
df_specpop = _merge_theme_keywords('specpop')
In [11]:
# Sanity check: for each theme, print the AO values from the keyword sheet that
# found no matching document title in the merge.
for theme_name, matched_df in (
    ('specpop', df_specpop),
    ('geriatrics', df_geriatric),
    ('adolescent', df_adolescent),
    ('mnchn', df_mnchn),
):
    print(set(keywords_final[theme_name].index) - set(matched_df.AO))
Labels: 1 = MNCHN, 2 = Adolescent, 3 = Geriatrics, 4 = Special Populations
In [12]:
def _labelled_examples(theme_df, label):
    """Build the training rows for one theme.

    Both the document bodies and the keyword strings of ``theme_df`` are used
    as training texts (bodies first, then keywords), all tagged with ``label``.
    """
    # pd.concat replaces Series.append, which was removed in pandas 2.0.
    texts = pd.concat(
        [theme_df['body'], theme_df['Final Keywords']],
        ignore_index=True,
    )
    return pd.DataFrame({'body': texts, 'label': label})


# Stack the per-theme examples in label order and renumber the rows.
# The labels follow the order of `themes`, which is what lets the
# predict_proba columns be named after `themes` further down.
df_train = pd.concat(
    [
        _labelled_examples(df_mnchn, 1),       # MNCHN
        _labelled_examples(df_adolescent, 2),  # Adolescent
        _labelled_examples(df_geriatric, 3),   # Geriatrics
        _labelled_examples(df_specpop, 4),     # Special Populations
    ],
    ignore_index=True,
)

# TODO: an "Other" class (documents that belong to none of the themes) is
# disregarded for now. Unfinished sketch kept for reference:
# df_train = df_train.append(pd.DataFrame({
# 'body': df_specpop['body'].append(df_specpop['Final Keywords']),
# 'label': 4
# set.difference(set(df.title),set(df_mnchn.AO).union(
# set(df_adolescent.AO)).union(
# set(df_geriatric.AO)).union(
# set(df_specpop.AO)))
# }))
In [13]:
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
# Settings for the TF-IDF vectoriser: 1- to 4-grams with accents folded to
# ASCII.
# NOTE(review): stopwords.words() is called without a language argument, so it
# presumably returns the stop words of every language in the NLTK corpus --
# confirm that this is intended.
tfid_params = dict(
    stop_words=stopwords.words(),
    ngram_range=(1, 4),
    strip_accents='ascii',
)

# Two-step pipeline: TF-IDF features feeding a multinomial naive Bayes model.
text_clf = Pipeline(
    steps=[
        ('vect_tfid', TfidfVectorizer(**tfid_params)),
        ('clf', MultinomialNB()),
    ]
)
In [14]:
# Fit the pipeline on the training texts and their theme labels, and keep the
# value returned by fit() as the cycle-1 model.
model_cycle_1 = text_clf.fit(df_train['body'], df_train['label'])
In [15]:
# Score every document. Each probability column is named after a theme, which
# relies on the labels 1..4 having been assigned in the order of `themes`.
theme_probabilities = model_cycle_1.predict_proba(df['body'])
results = pd.DataFrame(theme_probabilities, columns=themes)

# Attach the document title so that each row can be identified.
results['AO'] = df['title']
In [16]:
# For each theme, write the 40 highest-scoring documents (title and score) to
# '<theme>_cycle1_results.csv'.
for theme in themes:
    top_matches = results.sort_values(by=theme, ascending=False).head(40)
    top_matches[['AO', theme]].to_csv(theme + '_cycle1_results.csv', index=False)
In [14]:
# Write each theme's matched documents, without the body text, to its own CSV.
for theme_frame, csv_path in (
    (df_mnchn, 'mnchn_cycle1_keywords.csv'),
    (df_geriatric, 'geriatric_cycle1_keywords.csv'),
    (df_adolescent, 'adolescent_cycle1_keywords.csv'),
    (df_specpop, 'specpop_cycle1_keywords.csv'),
):
    theme_frame.drop('body', axis=1).to_csv(csv_path, index=False)
In [ ]: