In [1]:
%pylab inline
import pandas as pd
In [2]:
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
In [3]:
# Theme identifiers. They are used as CSV file-name prefixes here and, in
# later cells, as column names and dictionary keys.
themes = ['mnchn', 'adolescent', 'geriatrics', 'specpop']


def _read_theme_csv(theme, suffix, **read_csv_kwargs):
    """Read ``data/cycle_3/<theme>_<suffix>.csv`` and forward-fill missing cells.

    Parameters
    ----------
    theme : str
        One of the entries of ``themes``.
    suffix : str
        File-name suffix without extension (``'keywords'`` or ``'flagged'``).
    **read_csv_kwargs
        Passed straight through to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The file contents, with every NaN replaced by the last non-NaN value
        above it in the same column.
    """
    path = 'data/cycle_3/' + theme + '_' + suffix + '.csv'
    # DataFrame.ffill() is the non-deprecated spelling of fillna(method='ffill').
    return pd.read_csv(path, **read_csv_kwargs).ffill()


# NOTE(review): the dtype hint targets a column called 'Keywords', while the
# code below reads 'cycle_2_keywords' -- confirm the hint is still needed.
keywords = {
    theme: _read_theme_csv(theme, 'keywords', dtype={'Keywords': str})
    for theme in themes
}
flags = {theme: _read_theme_csv(theme, 'flagged') for theme in themes}

# One cleaned keyword string per AO, per theme.
keywords_final = {}
for theme in themes:
    theme_keywords = keywords[theme]
    # Header cells may carry stray whitespace; normalise before column access.
    theme_keywords.columns = theme_keywords.columns.str.strip()
    # Trailing comma acts as a separator once the rows of one AO are concatenated.
    theme_keywords['cycle_2_keywords'] += ','
    # Select the column *before* aggregating so unrelated columns are never summed.
    joined_keywords = theme_keywords.groupby('AO')['cycle_2_keywords'].sum()
    # NOTE(review): only the opening curly quote is stripped; the closing one
    # is left in place -- confirm whether that is intentional.
    keywords_final[theme] = (
        joined_keywords
        .str.replace('"', '')
        .str.replace('“', '')
        .str.replace('&', ' ')
        .str.replace(',', ' ')
        .str.replace('/', '')
    )
In [4]:
import sqlalchemy
from sqlalchemy.dialects import postgresql as psql
from sqlalchemy import Column, Integer, String, DATE
from sqlalchemy.ext.declarative import declarative_base
# Explicit import: no visible cell imports `datetime`; presumably it was only
# in scope as a side effect of `%pylab inline`.
import datetime

Base = declarative_base()


class Document(Base):
    """ORM mapping for a row of the ``document`` table.

    Text fields are stored as PostgreSQL ``TEXT``; ``images`` and ``raw_json``
    are stored as ``JSONB``.
    """

    __tablename__ = 'document'

    id = Column(Integer, primary_key=True)
    title = Column(psql.TEXT)
    date = Column(DATE)
    # NOTE(review): both columns are DATE but default to a full datetime, and
    # `modified` has no `onupdate`, so as written it is only populated on
    # insert -- confirm whether it should track updates.
    created = Column(DATE, default=datetime.datetime.now)
    modified = Column(DATE, default=datetime.datetime.now)
    doctype = Column(psql.TEXT)
    docnum = Column(psql.TEXT)
    subject = Column(psql.TEXT)
    body = Column(psql.TEXT)
    sign = Column(psql.TEXT)
    signtitle = Column(psql.TEXT)
    images = Column(psql.JSONB)
    raw_json = Column(psql.JSONB)

    def __repr__(self):
        """Return the document title, or a placeholder when no title is set.

        ``repr()`` must return a ``str``; ``title`` is nullable, so returning
        it unconditionally raises ``TypeError`` for untitled rows.
        """
        if self.title is None:
            return '<Document id={!r} (untitled)>'.format(self.id)
        return self.title
In [5]:
import os

from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

# Read the connection string from the DATABASE_URL environment variable so
# credentials need not live in the notebook. The fallback is the local
# development database this notebook originally hard-coded.
DATABASE_URL = os.environ.get('DATABASE_URL', 'postgresql://dev:dev@localhost/dev')

engine = create_engine(DATABASE_URL)
# Create any mapped tables that do not exist yet.
Base.metadata.create_all(engine)
Session = sessionmaker(bind=engine)
session = Session()
In [6]:
# Fetch the title and body of every stored document.
# NOTE(review): PostgreSQL folds the unquoted identifier `Document` to lower
# case, so this presumably resolves to the `document` table -- confirm.
query = 'SELECT title, body FROM Document'
In [7]:
# Run the query and load the result (columns `title` and `body`) into a DataFrame.
df = pd.read_sql_query(query, engine)
In [8]:
# Flatten each body onto a single line: newlines first, then tabs, each
# replaced by one space.
body_without_newlines = df['body'].str.replace('\n', ' ')
df['body'] = body_without_newlines.str.replace('\t', ' ')
In [9]:
def _merge_flags_with_keywords(theme):
    """Outer-join a theme's reviewer flags with its per-AO keyword string.

    The column named after the theme is dropped from the result, and rows
    without a FLAG value get FLAG = 1.
    """
    keyword_table = keywords_final[theme].reset_index()
    merged = flags[theme].merge(keyword_table, on='AO', how='outer')
    merged = merged.drop(theme, axis=1)
    merged['FLAG'] = merged['FLAG'].fillna(1)
    return merged


df_mnchn_flag = _merge_flags_with_keywords('mnchn')
df_adolescent_flag = _merge_flags_with_keywords('adolescent')
df_geriatric_flag = _merge_flags_with_keywords('geriatrics')
df_specpop_flag = _merge_flags_with_keywords('specpop')
In [10]:
# Attach the document body to every flagged AO by matching AO against the
# document title. The default inner join keeps only AOs that have a matching
# title; the redundant `title` column is dropped afterwards.
df_mnchn, df_adolescent, df_geriatric, df_specpop = (
    flag_frame.merge(df, left_on='AO', right_on='title').drop('title', axis=1)
    for flag_frame in (
        df_mnchn_flag,
        df_adolescent_flag,
        df_geriatric_flag,
        df_specpop_flag,
    )
)
In [11]:
# For each theme, print the AOs present in the flag table but absent from the
# table joined with document bodies.
for flagged_frame, joined_frame in (
    (df_specpop_flag, df_specpop),
    (df_geriatric_flag, df_geriatric),
    (df_adolescent_flag, df_adolescent),
    (df_mnchn_flag, df_mnchn),
):
    print(set(flagged_frame.AO) - set(joined_frame.AO))
Labels:
Reviewer feedback is integrated via sample weights: each document's FLAG value is used as its training weight (a FLAG of 0 becomes 0.001):
In [12]:
seed = 427  # NOTE(review): not referenced by any visible cell


def _build_theme_training_frame(theme_docs, label, zero_flag_weight=0.001):
    """Build the training rows for one theme.

    Each document contributes two candidate rows: its body text and its
    keyword string. Both carry the document's FLAG as sample weight.

    Parameters
    ----------
    theme_docs : pandas.DataFrame
        Frame with the columns ``body``, ``cycle_2_keywords`` and ``FLAG``.
    label : int
        Class label assigned to every returned row.
    zero_flag_weight : float, default 0.001
        Weight substituted for a FLAG of 0, so those rows are kept but
        contribute almost nothing.

    Returns
    -------
    pandas.DataFrame
        Columns ``body``, ``label``, ``weights``. Rows with missing values, an
        empty or single-space body, or a body already seen are removed. The
        index is renumbered before filtering, so it is unique.
    """
    # pd.concat replaces Series.append, which was removed in pandas 2.0.
    body = pd.concat(
        [theme_docs['body'], theme_docs['cycle_2_keywords']], ignore_index=True
    )
    flag = pd.concat([theme_docs['FLAG'], theme_docs['FLAG']], ignore_index=True)
    frame = pd.DataFrame({
        'body': body,
        'label': label,
        'weights': flag.replace(0, zero_flag_weight),
    })
    frame = frame.dropna()
    frame = frame[(frame.body != '') & (frame.body != ' ')]
    return frame.drop_duplicates('body')


# One labelled training frame per theme (labels 1-4).
df_train_mnchn = _build_theme_training_frame(df_mnchn, label=1)
df_train_adolescent = _build_theme_training_frame(df_adolescent, label=2)
df_train_geriatrics = _build_theme_training_frame(df_geriatric, label=3)
df_train_specpop = _build_theme_training_frame(df_specpop, label=4)

# "Other" class (label 5): every document that was flagged 0 for at least one
# theme, weighted 0.999.
rejected_titles = set().union(
    *(
        set(theme_docs[theme_docs.FLAG == 0].AO)
        for theme_docs in (df_mnchn, df_adolescent, df_geriatric, df_specpop)
    )
)
df_train_other = (
    df.loc[df.title.isin(rejected_titles), ['body']]
    # Same guard the theme frames get: a missing body cannot be vectorised.
    .dropna(subset=['body'])
    .assign(label=5, weights=0.999)
)

# Stack all classes into the final training set. pd.concat replaces the
# chained DataFrame.append calls, which were removed in pandas 2.0.
df_train = pd.concat(
    [
        df_train_mnchn,
        df_train_adolescent,
        df_train_geriatrics,
        df_train_specpop,
        df_train_other,
    ],
    ignore_index=True,
)
In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from string import punctuation
from nltk.corpus import stopwords
from nltk import word_tokenize
# English stop words followed by every single punctuation character.
stop_words = [*stopwords.words('english'), *punctuation]


def tokenize(text):
    """Split ``text`` into lower-cased tokens for the vectorizer.

    Tokens that appear in ``stop_words`` or consist solely of digits are
    discarded.
    """
    lowered_tokens = (token.lower() for token in word_tokenize(text))
    return [
        token
        for token in lowered_tokens
        if token not in stop_words and not token.isdigit()
    ]
# TF-IDF settings: 1- to 4-gram features, accents stripped to ASCII, and the
# custom tokenizer defined in this cell.
tfid_params = {
    'ngram_range': (1, 4),
    'strip_accents': 'ascii',
    'tokenizer': tokenize,
}

# Two-step pipeline: TF-IDF vectorisation feeding a multinomial naive Bayes
# classifier.
vectorizer_step = ('vectorizer', TfidfVectorizer(**tfid_params))
classifier_step = ('clf', MultinomialNB())
text_clf = Pipeline([vectorizer_step, classifier_step])
In [14]:
# Fit the pipeline on the combined training set. The `clf__` prefix routes the
# per-row weights to the classifier step.
sample_weight_kwargs = {'clf__sample_weight': df_train.weights}
model_cycle_3 = text_clf.fit(
    df_train.body,
    df_train.label.values,
    **sample_weight_kwargs
)
In [15]:
# Score every document in `df`: one probability column per theme plus 'other',
# with the document title stored alongside as 'AO'.
class_probabilities = model_cycle_3.predict_proba(df.body)
results = pd.DataFrame(class_probabilities, columns=themes + ['other'])
results['AO'] = df.title
In [16]:
# For each theme, write the 100 highest-probability documents (AO and score)
# to '<theme>_cycle3_results.csv'.
for theme in themes:
    ranked = results.sort_values(by=theme, ascending=False)
    top_hits = ranked[['AO', theme]].head(100)
    top_hits.to_csv(theme + '_cycle3_results.csv', index=False)
In [17]:
# mnchn = text_clf.fit(df_train_mnchn.body, df_train_mnchn.label.values,
# **{'clf__sample_weight': df_train_mnchn.weights})
# specpop = text_clf.fit(df_train_specpop.body, df_train_specpop.label,
# **{'clf__sample_weight': df_train_specpop.weights})
# adolescent = text_clf.fit(df_train_adolescent.body, df_train_adolescent.label,
# **{'clf__sample_weight': df_train_adolescent.weights})
# geriatrics = text_clf.fit(df_train_geriatrics.body, df_train_geriatrics.label,
# **{'clf__sample_weight': df_train_geriatrics.weights})
In [18]:
# df_mnchn_res = pd.DataFrame(mnchn.predict_proba(df.body), columns=['irrelevance', 'relevance'])
# df_mnchn_res['AO'] = df.title
# df_specpop_res = pd.DataFrame(specpop.predict_proba(df.body), columns=['irrelevance', 'relevance'])
# df_specpop_res['AO'] = df.title
# df_adolescent_res = pd.DataFrame(adolescent.predict_proba(df.body), columns=['irrelevance', 'relevance'])
# df_adolescent_res['AO'] = df.title
# df_geriatric_res = pd.DataFrame(geriatrics.predict_proba(df.body), columns=['irrelevance', 'relevance'])
# df_geriatric_res['AO'] = df.title
In [19]:
# a = df_mnchn_res.sort_values('irrelevance', ascending=False)[:30]
# # set.intersection(set(a.AO),set(df_mnchn_flag.AO))
# # set.difference( set(df_mnchn[df_mnchn.FLAG != 0].AO), set(a.AO))
# a
In [ ]: