In [1]:
%pylab inline
import pandas as pd
In [2]:
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
In [3]:
# Themes handled by this notebook. Each one has a '<theme>_keywords.csv' and a
# '<theme>_flagged.csv' file under data/cycle_2/.
themes = ['mnchn', 'adolescent', 'geriatrics', 'specpop']

# Raw keyword sheets, one DataFrame per theme. Empty cells are forward-filled
# from the row above. `.ffill()` is the supported spelling of the deprecated
# `fillna(method='ffill')`.
keywords = {
    theme: pd.read_csv(
        'data/cycle_2/' + theme + '_keywords.csv', dtype={'Keywords': str}
    ).ffill()
    for theme in themes
}

# Flag sheets, one DataFrame per theme, forward-filled the same way.
flags = {
    theme: pd.read_csv('data/cycle_2/' + theme + '_flagged.csv').ffill()
    for theme in themes
}

# Characters removed from (or blanked out of) the concatenated keyword strings,
# applied in this order as literal (non-regex) replacements.
_KEYWORD_REPLACEMENTS = (
    ('"', ''),
    ('“', ''),
    ('&', ' '),
    (',', ' '),
    ('/', ''),
)

# One cleaned keyword string per AO, per theme (Series indexed by 'AO').
keywords_final = {}
for theme in themes:
    # Column headers may carry stray whitespace in the CSV.
    keywords[theme].columns = keywords[theme].columns.str.strip()
    # Trailing comma so that keywords from different rows stay separated once
    # the strings are concatenated by the groupby sum below.
    keywords[theme]['cycle_2_keywords'] += ','
    # Select the column before summing so that only the keyword strings are
    # aggregated, not every other column of the sheet.
    joined = keywords[theme].groupby('AO')['cycle_2_keywords'].sum()
    for old, new in _KEYWORD_REPLACEMENTS:
        joined = joined.str.replace(old, new, regex=False)
    keywords_final[theme] = joined
In [4]:
import sqlalchemy
from sqlalchemy.dialects import postgresql as psql
from sqlalchemy import Column, Integer, String, DATE
from sqlalchemy.ext.declarative import declarative_base
Base = declarative_base()


class Document(Base):
    """ORM mapping for rows of the ``document`` table.

    Columns: an integer primary key ``id``, a ``date`` column, several TEXT
    columns (``title``, ``doctype``, ``docnum``, ``subject``, ``body``,
    ``sign``, ``signtitle``) and two JSONB columns (``images``, ``raw_json``).
    """

    __tablename__ = 'document'

    id = Column(Integer, primary_key=True)
    title = Column(psql.TEXT)
    date = Column(DATE)
    doctype = Column(psql.TEXT)
    docnum = Column(psql.TEXT)
    subject = Column(psql.TEXT)
    body = Column(psql.TEXT)
    sign = Column(psql.TEXT)
    signtitle = Column(psql.TEXT)
    images = Column(psql.JSONB)
    raw_json = Column(psql.JSONB)

    def __repr__(self):
        """Return the document title, or a placeholder when the title is unset."""
        # `title` is a nullable column and __repr__ must return a str;
        # returning None would raise TypeError whenever the object is shown.
        if self.title is not None:
            return self.title
        return '<Document id=%r>' % (self.id,)
In [5]:
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
# NOTE(review): the connection URL (including user and password) is hard-coded;
# consider loading it from the environment or a config file.
db_url = 'postgresql://dev:dev@localhost/dev'
engine = create_engine(db_url)

# Session factory bound to the engine, then the table creation for every
# model registered on Base.
Session = sessionmaker(bind=engine)
Base.metadata.create_all(engine)

session = Session()
In [6]:
# SQL text used to load the title and body columns of every Document row.
query = 'SELECT title, body FROM Document'
In [7]:
# Run the query on the configured engine and keep the rows as a DataFrame.
df = pd.read_sql_query(sql=query, con=engine)
In [8]:
# Replace newlines, then tabs, in the document bodies with single spaces.
body_without_newlines = df['body'].str.replace('\n', ' ')
df['body'] = body_without_newlines.str.replace('\t', ' ')
In [9]:
def _flags_with_keywords(theme):
    """Outer-join a theme's flag sheet with its per-AO keyword strings.

    The column named after the theme is dropped from the result, and any
    missing FLAG value is set to 1.
    """
    keyword_table = keywords_final[theme].reset_index()
    merged = flags[theme].merge(keyword_table, on='AO', how='outer')
    merged = merged.drop(theme, axis=1)
    merged['FLAG'] = merged['FLAG'].fillna(1)
    return merged


df_mnchn_flag = _flags_with_keywords('mnchn')
df_adolescent_flag = _flags_with_keywords('adolescent')
df_geriatric_flag = _flags_with_keywords('geriatrics')
df_specpop_flag = _flags_with_keywords('specpop')
In [10]:
def _with_document_body(flag_frame):
    """Inner-join a flag/keyword frame to the documents on AO == title.

    The redundant 'title' column is removed from the result.
    """
    joined = pd.merge(flag_frame, df, left_on='AO', right_on='title')
    return joined.drop(columns='title')


df_mnchn = _with_document_body(df_mnchn_flag)
df_adolescent = _with_document_body(df_adolescent_flag)
df_geriatric = _with_document_body(df_geriatric_flag)
df_specpop = _with_document_body(df_specpop_flag)
In [11]:
# For each theme, show the AOs present in the flag/keyword frame that found no
# matching document title in the merge above.
for flagged_frame, matched_frame in (
    (df_specpop_flag, df_specpop),
    (df_geriatric_flag, df_geriatric),
    (df_adolescent_flag, df_adolescent),
    (df_mnchn_flag, df_mnchn),
):
    print(set(flagged_frame.AO) - set(matched_frame.AO))
Labels:
Reviewer feedback is integrated via sample weights.
Documents not included in the reviewed lists still need to be sampled and added as assumed negatives (label 0) with a weight of 0.5.
In [12]:
seed = 427  # not used in this cell; kept because other cells may refer to it


def _build_theme_training_frame(theme_df, label):
    """Build the training rows contributed by one theme.

    Every row of ``theme_df`` yields two candidate training texts: its
    document ``body`` and its ``cycle_2_keywords`` string. Both carry the
    row's ``FLAG`` value as sample weight. Rows with a missing value, or
    whose FLAG rounds up to 0, are discarded; the remaining rows all get
    the integer class ``label``.

    Parameters
    ----------
    theme_df : DataFrame with 'body', 'cycle_2_keywords' and 'FLAG' columns.
    label : integer class code assigned to every surviving row.

    Returns
    -------
    DataFrame with columns 'body', 'label', 'weights'. The index is the
    index of ``theme_df`` repeated twice, minus the dropped rows.
    """
    # pd.concat replaces Series.append, which was removed in pandas 2.0.
    texts = pd.concat([theme_df['body'], theme_df['cycle_2_keywords']])
    flag = pd.concat([theme_df['FLAG'], theme_df['FLAG']])

    # Raw arrays plus an explicit index avoid any alignment on the
    # (duplicated) index labels.
    frame = pd.DataFrame(
        {
            'body': texts.values,
            'label': np.ceil(flag.values),
            'weights': flag.values,
        },
        index=texts.index,
    )
    frame = frame.dropna()
    # .copy() so the label assignment below writes to an independent frame
    # rather than to a filtered view.
    frame = frame[frame['label'] != 0].copy()
    frame['label'] = label
    return frame


# Class codes 1..4 follow the order of `themes`; the later predict_proba
# output is labelled with `themes`, so the two orders must stay in sync.
df_train_mnchn = _build_theme_training_frame(df_mnchn, 1)
df_train_adolescent = _build_theme_training_frame(df_adolescent, 2)
df_train_geriatrics = _build_theme_training_frame(df_geriatric, 3)
df_train_specpop = _build_theme_training_frame(df_specpop, 4)

# pd.concat replaces the removed DataFrame.append chain; row order and index
# labels are the same as before.
df_train = pd.concat([
    df_train_mnchn,
    df_train_adolescent,
    df_train_geriatrics,
    df_train_specpop,
])
In [13]:
# seed = 427
# # MNCHN
# df_train_mnchn = pd.DataFrame()
# df_train_mnchn['body'] = df_mnchn['body'].append(df_mnchn['cycle_2_keywords'])
# df_train_mnchn['label'] = np.ceil(df_mnchn['FLAG'].append(df_mnchn['FLAG']).replace('', 0))
# df_train_mnchn['weights'] = df_mnchn['FLAG'].append(df_mnchn['FLAG']).replace('', 1)
# df_train_mnchn = df_train_mnchn.dropna()
# df_train_mnchn = df_train_mnchn.append(pd.DataFrame({
# 'body': df[~df.title.isin(df_mnchn.AO)].sample(30, random_state=seed).body.values,
# 'label': 0,
# 'weights': 0.5
# }))
# # Geriatrics
# df_train_geriatrics = pd.DataFrame()
# df_train_geriatrics['body'] = df_geriatric['body'].append(df_geriatric['cycle_2_keywords'])
# df_train_geriatrics['label'] = np.ceil(df_geriatric['FLAG'].append(df_geriatric['FLAG']).replace('', 0))
# df_train_geriatrics['weights'] = df_geriatric['FLAG'].append(df_geriatric['FLAG']).replace('', 1)
# df_train_geriatrics = df_train_geriatrics[df_train_geriatrics.body != '']
# df_train_geriatrics = df_train_geriatrics.dropna()
# df_train_geriatrics = df_train_geriatrics.append(pd.DataFrame({
# 'body': df[~df.title.isin(df_geriatric.AO)].sample(30, random_state=seed).body.values,
# 'label': 0,
# 'weights': 0.5
# }))
# # Spec Pop
# df_train_specpop = pd.DataFrame()
# df_train_specpop['body'] = df_specpop['body'].append(df_specpop['cycle_2_keywords'])
# df_train_specpop['label'] = np.ceil(df_specpop['FLAG'].append(df_specpop['FLAG']).replace('', 0))
# df_train_specpop['weights'] = df_specpop['FLAG'].append(df_specpop['FLAG']).replace('', 1)
# df_train_specpop = df_train_specpop.dropna()
# df_train_specpop = df_train_specpop.append(pd.DataFrame({
# 'body': df[~df.title.isin(df_specpop.AO)].sample(30, random_state=seed).body.values,
# 'label': 0,
# 'weights': 0.5
# }))
# # Adolescent
# df_train_adolescent = pd.DataFrame()
# df_train_adolescent['body'] = df_adolescent['body'].append(df_adolescent['cycle_2_keywords'])
# df_train_adolescent['label'] = np.ceil(df_adolescent['FLAG'].append(df_adolescent['FLAG']).replace('', 0))
# df_train_adolescent['weights'] = df_adolescent['FLAG'].append(df_adolescent['FLAG']).replace('', 1)
# df_train_adolescent = df_train_adolescent.dropna()
# df_train_adolescent = df_train_adolescent.append(pd.DataFrame({
# 'body': df[~df.title.isin(df_adolescent.AO)].sample(30, random_state=seed).body.values,
# 'label': 0,
# 'weights': 0.5
# }))
In [14]:
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
# Settings for the TF-IDF vectorizer: word n-grams of length 1 to 4, accents
# stripped to ASCII, and the NLTK stop-word list.
# NOTE(review): stopwords.words() is called without a language argument;
# presumably this yields the stop words of every language in the corpus -
# confirm that this is intended.
tfid_params = dict(
    stop_words=stopwords.words(),
    ngram_range=(1, 4),
    strip_accents='ascii',
)

# Two-step pipeline: TF-IDF features feeding a multinomial naive Bayes model.
text_clf = Pipeline(steps=[
    ('vect_tfid', TfidfVectorizer(**tfid_params)),
    ('clf', MultinomialNB()),
])
In [15]:
# Fit the pipeline on the training texts; the per-row weights are routed to
# the 'clf' step through the `clf__` prefix.
model_cycle_2 = text_clf.fit(
    df_train['body'],
    df_train['label'].values,
    clf__sample_weight=df_train['weights'],
)
In [16]:
# Class probabilities for every document, one column per theme, plus the
# document title under 'AO'.
theme_probabilities = model_cycle_2.predict_proba(df['body'])
results = pd.DataFrame(theme_probabilities, columns=themes)
results['AO'] = df['title']
In [17]:
# For each theme, take the 60 highest-probability documents, keep only those
# whose AO is not already in that theme's keyword list, and write them to
# '<theme>_cycle2_results.csv'.
for theme in themes:
    ranked = results.sort_values(by=theme, ascending=False)
    top_hits = ranked.head(60)[['AO', theme]]
    known_aos = set(keywords_final[theme].index)
    new_aos = set(top_hits.AO) - known_aos
    top_hits[top_hits.AO.isin(new_aos)].to_csv(theme+'_cycle2_results.csv', index=False)
In [18]:
# mnchn = text_clf.fit(df_train_mnchn.body, df_train_mnchn.label.values,
# **{'clf__sample_weight': df_train_mnchn.weights})
# specpop = text_clf.fit(df_train_specpop.body, df_train_specpop.label,
# **{'clf__sample_weight': df_train_specpop.weights})
# adolescent = text_clf.fit(df_train_adolescent.body, df_train_adolescent.label,
# **{'clf__sample_weight': df_train_adolescent.weights})
# geriatrics = text_clf.fit(df_train_geriatrics.body, df_train_geriatrics.label,
# **{'clf__sample_weight': df_train_geriatrics.weights})
In [19]:
# df_mnchn_res = pd.DataFrame(mnchn.predict_proba(df.body), columns=['irrelevance', 'relevance'])
# df_mnchn_res['AO'] = df.title
# df_specpop_res = pd.DataFrame(specpop.predict_proba(df.body), columns=['irrelevance', 'relevance'])
# df_specpop_res['AO'] = df.title
# df_adolescent_res = pd.DataFrame(adolescent.predict_proba(df.body), columns=['irrelevance', 'relevance'])
# df_adolescent_res['AO'] = df.title
# df_geriatric_res = pd.DataFrame(geriatrics.predict_proba(df.body), columns=['irrelevance', 'relevance'])
# df_geriatric_res['AO'] = df.title
In [20]:
# a = df_mnchn_res.sort_values('relevance', ascending=False)[:40]
# # set.intersection(set(a.AO),set(df_mnchn_flag.AO))
# a
In [23]:
# Write each theme's merged frame, without the 'body' and 'FLAG' columns, to
# '<name>_cycle2_keywords.csv'.
for out_name, theme_frame in (
    ('mnchn', df_mnchn),
    ('geriatric', df_geriatric),
    ('adolescent', df_adolescent),
    ('specpop', df_specpop),
):
    trimmed = theme_frame.drop(columns=['body', 'FLAG'])
    trimmed.to_csv(out_name + '_cycle2_keywords.csv', index=False)
In [ ]: