In [16]:
%matplotlib inline
import matplotlib as mpl
mpl.rcParams['font.size'] = 16.0
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd # process data with pandas dataframe
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("poster")
import time
import os
Some of the main features of the IPython Notebook app include:
Personal advantages for my projects:
The Ohsumed test collection (available at ftp://medir.ohsu.edu/pub/ohsumed) is a subset of the MEDLINE database, a bibliographic database of important, peer-reviewed medical literature maintained by the National Library of Medicine. The subset I consider in this project consists of the first 20,000 documents out of the 50,216 medical abstracts from the year 1991. The classification scheme consists of the 23 Medical Subject Headings (MeSH) categories of the cardiovascular diseases group:
| Category | Description |
|---|---|
| C01 | Bacterial Infections and Mycoses |
| C02 | Virus Diseases |
| C03 | Parasitic Diseases |
| C04 | Neoplasms |
| C05 | Musculoskeletal Diseases |
| C06 | Digestive System Diseases |
| C07 | Stomatognathic Diseases |
| C08 | Respiratory Tract Diseases |
| C09 | Otorhinolaryngologic Diseases |
| C10 | Nervous System Diseases |
| C11 | Eye Diseases |
| C12 | Urologic and Male Genital Diseases |
| C13 | Female Genital Diseases and Pregnancy Complications |
| C14 | Cardiovascular Diseases |
| C15 | Hemic and Lymphatic Diseases |
| C16 | Neonatal Diseases and Abnormalities |
| C17 | Skin and Connective Tissue Diseases |
| C18 | Nutritional and Metabolic Diseases |
| C19 | Endocrine Diseases |
| C20 | Immunologic Diseases |
| C21 | Disorders of Environmental Origin |
| C22 | Animal Diseases |
| C23 | Pathological Conditions, Signs and Symptoms |
The following code assumes that the data has been extracted into a folder named 'Data' in the same folder as this IPython notebook.
The code below iterates over the extracted files and converts the Ohsumed directory structure into a single sqlite database. The original dataset is already divided into training and test sets; we will keep this division.
In [29]:
import sqlite3 as sqlite
def get_all_tables(c):
    """
    Helper function - Gets a list of all the tables in the database.
    """
    all_tables = []
    c.execute('SELECT name FROM SQLITE_MASTER WHERE type = "table"')
    for tbl in c:
        all_tables.append(tbl[0])
    return all_tables

def drop_tables(c, tables):
    """
    Helper function - Checks that the specified tables exist, and drops those that do.
    """
    all_tables = get_all_tables(c)
    for t in tables:
        if t in all_tables:
            c.execute('DROP TABLE %s' % t)

def create_documents_table(c):
    """
    Helper function - Uses SQL to create the Documents table.
    """
    drop_tables(c, ['Documents'])
    c.execute("""CREATE TABLE Documents (
                     DOCID TEXT PRIMARY KEY,
                     NOTE_TEXT TEXT,
                     CATEGORY TEXT
                 )""")

def add_document(c, docid, text, category):
    """
    Helper function - Adds one document to the Documents table.
    """
    c.execute('insert or replace into Documents values ( ?, ?, ? )', (docid, text, category))

def ohsumed2sqlite(src_root_dir, dest_sqlite):
    start_time = time.time()
    print 'Converting ohsumed directory structure {0} to sqlite database'.format(src_root_dir)
    # Truncate any existing database file so we start from an empty database
    fout = open(dest_sqlite, 'w')
    fout.close()
    conn_out = sqlite.connect(dest_sqlite)
    c_out = conn_out.cursor()
    create_documents_table(c_out)
    # Process the ohsumed directory
    doc_categories = {}
    for root, dirs, files in os.walk(src_root_dir):
        for f in files:
            category = os.path.basename(root)  # the directory name is the category
            with open(os.path.join(root, f), "r") as cur_file:
                data = cur_file.read()
            # A document may appear under several category directories;
            # accumulate all of its categories in a comma-separated list
            if f in doc_categories:
                category = doc_categories[f] + ',' + category
            doc_categories[f] = category
            add_document(c_out, f, data, category)
    conn_out.commit()
    c_out.close()
    print("--- ohsumed2sqlite %s seconds ---" % (time.time() - start_time))
In [30]:
# Convert the training data
ohsumed2sqlite('.\\Data\\ohsumed-first-20000-docs\\training', 'training.sqlite')
# Convert the test data
ohsumed2sqlite('.\\Data\\ohsumed-first-20000-docs\\test', 'test.sqlite')
In [64]:
con = sqlite.connect('training.sqlite')
df_train = pd.read_sql_query("SELECT * from Documents", con)
con.close()
con = sqlite.connect('test.sqlite')
df_test = pd.read_sql_query("SELECT * from Documents", con)
con.close()
In [65]:
pd.options.display.max_colwidth = 200
df_train.sample(20)
Out[65]:
As we see, each document is assigned one or more classes. In this exercise I would like to implement a simple binary classification workflow, so I define two binary classes: documents tagged with category C14 (Cardiovascular Diseases) are the positive (True) class, and all other documents form the negative (False) class.
The following code converts the data to a binary classification dataset:
In [66]:
import re

def ConvertCategoryColToBinVal(df, poscat):
    # True if the document's category list contains the positive category
    df['CATEGORY'] = df['CATEGORY'].apply(lambda x: bool(re.search(poscat, x, re.IGNORECASE)))

ConvertCategoryColToBinVal(df_train, 'C14')
ConvertCategoryColToBinVal(df_test, 'C14')
Let's take another look at the data:
In [67]:
df_train.sample(20)
Out[67]:
In [72]:
print 'Training data has: ', len(df_train.index), ' documents'
print 'Test data has: ', len(df_test.index), ' documents'
In [73]:
plt.axis('equal')
plt.pie(
df_train.CATEGORY.value_counts().tolist(),
labels=['False', 'True'],
autopct='%1.1f%%',
colors=("#E13F29", "#D69A80"));
As the pie chart shows, the classes are imbalanced: the accuracy of a dumb classifier that labels all the documents as False is already >80%.
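A quick check of that claim from the class balance in the training data (after the conversion above the CATEGORY column is boolean, so its mean is the fraction of True documents):

```python
# An "always False" classifier is correct on every document that is not True
print 'Fraction of True documents:', df_train.CATEGORY.mean()
print 'Always-False accuracy     :', 1 - df_train.CATEGORY.mean()
```

As a slightly smarter baseline, the next cell labels a document as positive whenever its text mentions "Cardio":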
In [79]:
import re

def baseline_cpr_classifier(txt):
    # Naive baseline: a document is positive if its text mentions "Cardio"
    if re.search('Cardio', txt, re.IGNORECASE):
        return True
    else:
        return False
In [90]:
Ytrue = df_test.CATEGORY.tolist()
Ypred = []
for index, row in df_test.iterrows():
    Ypred.append(baseline_cpr_classifier(row['NOTE_TEXT']))
In [91]:
dictY = {'Actual' : Ytrue, 'Predicted': Ypred}
dfY = pd.DataFrame.from_dict(dictY)
dfY.sample(20)
Out[91]:
In [92]:
from ipy_table import *
confusion_matrix_binary = [
['', 'Predicted 0', 'Predicted 1'],
['Actual 0', 'True Negative', 'False Positive'],
['Actual 1', 'False Negative', 'True Positive']
]
make_table(confusion_matrix_binary)
apply_theme('basic_both')
Out[92]:
In [93]:
from sklearn.metrics import confusion_matrix
conf_mat = confusion_matrix(Ytrue, Ypred)
print conf_mat
print sum(conf_mat[0])
Some code to prettify the printout of the confusion matrix in the notebook:
In [87]:
def render_confusion_matrix(ytrue, ypred):
    return pd.crosstab(pd.Series(ytrue), pd.Series(ypred), rownames=['Actual'], colnames=['Predicted'], margins=True)
In [88]:
render_confusion_matrix(Ytrue, Ypred)
Out[88]:
In [89]:
from sklearn.metrics import classification_report, accuracy_score
print
print 'Accuracy: ', accuracy_score(Ytrue, Ypred)
print
print classification_report(Ytrue, Ypred)
Even though the accuracy is OK, the recall is miserable. Let's build a real classifier.
Our vector space is a dictionary of all the N-grams in our set of documents.
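As a toy illustration of what such a vector space looks like (using scikit-learn's default tokenizer and two made-up sentences; the real pipeline below uses a custom stemming tokenizer):

```python
from sklearn.feature_extraction.text import CountVectorizer

# Two made-up sentences and the unigram + bigram vocabulary they induce
toy_docs = ["acute myocardial infarction", "acute renal failure"]
toy_vect = CountVectorizer(ngram_range=(1, 2))
toy_counts = toy_vect.fit_transform(toy_docs)

print toy_vect.get_feature_names()
# roughly: ['acute', 'acute myocardial', 'acute renal', 'failure',
#           'infarction', 'myocardial', 'myocardial infarction',
#           'renal', 'renal failure']
print toy_counts.toarray()
```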
In [99]:
from nltk.stem.porter import *
from nltk.tokenize import word_tokenize
import string
stemmer = PorterStemmer()
def stem_tokens(tokens, stemmer):
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed

def tokenize(text):
    # Remove punctuation, tokenize, keep alphabetic tokens only, then stem
    text = "".join([ch for ch in text if ch not in string.punctuation])
    tokens = word_tokenize(text)
    tokens = [item for item in tokens if item.isalpha()]
    stems = stem_tokens(tokens, stemmer)
    return stems
In [100]:
from sklearn.feature_extraction.text import CountVectorizer

# Read the training data from the pandas dataframe into lists
X_train = df_train['NOTE_TEXT'].tolist()
Y_train = df_train.CATEGORY.tolist()

# Build the uni/bi-gram count matrix
vectorizer = CountVectorizer(tokenizer=tokenize, ngram_range=(1, 2))
wcounts = vectorizer.fit_transform(X_train)
In [102]:
wcounts
Out[102]:
We have 6286 documents in our training set, each represented by 355860 features (the combination of all the uni/bi-grams).
In [103]:
feats = vectorizer.get_feature_names()
feats[:200]
Out[103]:
At this point in the workflow there are many feature selection and engineering techniques we could apply: some generic, some based on natural language processing, and some unique to the medical domain. However, to keep it simple for now, let's move on to the classifier.
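(Just as an illustration of the kind of step that could be slotted in here, and not used in the rest of this notebook, below is a minimal sketch of one generic technique: chi-squared feature selection, which keeps the k n-grams most associated with the class labels. The value k=20000 is an arbitrary choice for the example.)

```python
from sklearn.feature_selection import SelectKBest, chi2

# Keep only the 20,000 n-grams most associated with the labels (illustrative value)
selector = SelectKBest(chi2, k=20000)
wcounts_reduced = selector.fit_transform(wcounts, Y_train)
print wcounts_reduced.shape
```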
For the classifier we use Naive Bayes. By Bayes' rule, the probability of class $Y$ given the words $w_{1}, w_{2}, ..., w_{N}$ of a document is
$P(Y|w_{1}, w_{2}, ..., w_{N})=\frac{P(w_{1}, w_{2}, ..., w_{N}|Y) \cdot P(Y)}{P(W)}$
and the "naive" assumption that the words are conditionally independent given the class yields
$\implies P(Y|w_{1}, w_{2}, ..., w_{N})=\frac{1}{P(W)} \cdot P(Y) \cdot \prod_{k=1}^{N} P(w_{k}|Y)$
Since $P(W)$, the probability of the features, does not depend on $Y$, and all we care about is finding the most likely $Y$, we can drop $\frac{1}{P(W)}$ and stay with:
$P(Y|w_{1}, w_{2}, ..., w_{N}) \propto P(Y) \cdot \prod_{k=1}^{N} P(w_{k}|Y)$
We estimate the probabilities from the training set using a multinomial distribution with a smoothing factor $\alpha$:
$P(w_{i}|Y) = \frac{count_{i,Y}+\alpha}{\sum_{k} count_{k,Y}+\alpha \cdot |V|}$
where $count_{i,Y}$ is the number of occurrences of feature $w_{i}$ in the training documents of class $Y$ and $|V|$ is the number of features (the size of the vocabulary).
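To make the estimate concrete, here is a tiny sketch with made-up counts; it checks that scikit-learn's MultinomialNB should reproduce exactly this smoothed estimate (as log probabilities) in its feature_log_prob_ attribute:

```python
import numpy as np
from sklearn.naive_bayes import MultinomialNB

# Toy count matrix (made-up numbers): 3 documents, 3 features, 2 classes
X_toy = np.array([[3, 0, 1],
                  [2, 1, 0],
                  [0, 4, 2]])
y_toy = [True, True, False]
alpha = 0.1

clf_toy = MultinomialNB(alpha=alpha).fit(X_toy, y_toy)

# Manual smoothed estimate for the True class:
# (count_i + alpha) / (sum_k count_k + alpha * |V|)
true_counts = X_toy[:2].sum(axis=0)  # per-feature counts in the True documents
manual = (true_counts + alpha) / (true_counts.sum() + alpha * X_toy.shape[1])

# classes_ is sorted ([False, True]), so row 1 corresponds to the True class
print np.allclose(clf_toy.feature_log_prob_[1], np.log(manual))  # True
```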
In [111]:
from sklearn.naive_bayes import MultinomialNB
clf_nb = MultinomialNB(alpha=0.1)
clf_nb.fit(wcounts, Y_train)
Out[111]:
A cool feature of the Naive Bayes classifier is that it can list the most relevant features for each class. Below are the features that are most indicative of the positive documents. Some of them are trivial English words that will also appear in the list of features relevant to the negative documents; however, we also see some domain-specific features, such as the stems arteri and coronari:
In [112]:
pf = [(clf_nb.feature_log_prob_[1, i], feats[i]) for i in range(len(feats))]
pf.sort(reverse=True)
for p in pf[:25]:
    print 'Positive word %.2f: %s' % (p[0], p[1])
In [117]:
# Read the data from pandas dataframe to an array
X_test = df_test['NOTE_TEXT'].tolist()
Y_test = df_test.CATEGORY.tolist()
# Convert the text to arrays of numbers
counts_test = vectorizer.transform(X_test)
In [118]:
counts_test
Out[118]:
We have 7643 documents in our test set, each represented by the same 355860 features we derived while processing the training set.
In [119]:
# Predict the values of the test set
predictions = clf_nb.predict(counts_test)
In [120]:
# Look at the first 20 predictions
predictions[:20]
Out[120]:
In [122]:
render_confusion_matrix(Y_test, predictions)
Out[122]:
In [124]:
print
print 'Accuracy: ', accuracy_score(Y_test, predictions)
print
print classification_report(Y_test, predictions)
As expected, we improved the accuracy compared to the baseline classifier, and dramatically improved the recall on the positive documents, from 29% to 78%.
Before we move on to optimizing the classifier, let's look at another interesting type of output from Naive Bayes: insightful information beyond the hard predictions on the test data.
In [138]:
iwrong_predictions = [i for i,v in enumerate(zip(Y_test, predictions)) if v[0] != v[1]]
iwrong_predictions[:20]
Out[138]:
In [139]:
proba = clf_nb.predict_proba(counts_test)
log_proba = clf_nb.predict_log_proba(counts_test)
In [140]:
proba[:10]
Out[140]:
The output of clf_nb.predict_proba(counts_test) is an (N examples, 2) array. The first column gives the probability $P(Y=0)$, i.e. $P(False)$, and the second gives $P(Y=1)$, i.e. $P(True)$.
It is usually more convenient to work with the log values of the probabilities, i.e. the result of clf_nb.predict_log_proba.
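Two quick sanity checks, as a sketch assuming the column order follows clf_nb.classes_ = [False, True] as described above:

```python
# The probabilities should be the exponential of the log probabilities
print np.allclose(proba, np.exp(log_proba))

# Column 1 is P(True) and column 0 is P(False); the hard prediction should be
# True exactly when log P(True) - log P(False) > 0
print np.array_equal((log_proba[:, 1] - log_proba[:, 0]) > 0, predictions)
```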
In [141]:
diff_prob = proba[:,1] - proba[:,0]
diff_log_proba = log_proba[:,1] - log_proba[:,0]
In [142]:
print 'diff_prob:\n'
print 'mean:',np.mean(diff_prob)
print 'std:', np.std(diff_prob)
print '\ndiff_log_prob:\n'
print 'mean:', np.mean(diff_log_proba)
print 'std:', np.std(diff_log_proba)
In [143]:
diff_log_proba[-10:]
Out[143]:
In [149]:
# Plot histogram.
plt.hist(diff_log_proba, range=[-500, 500], bins=30, normed=True, alpha=0.5)
plt.axvline(0, color='r')
Out[149]:
In [151]:
pospts = [v[1] for i,v in enumerate(zip(Y_test, diff_log_proba)) if v[0] == 1]
negpts = [v[1] for i,v in enumerate(zip(Y_test, diff_log_proba)) if v[0] == 0]
In [202]:
plt.scatter(
    pospts,
    np.random.uniform(0.9, 1.1, len(pospts)),
    color='blue')
plt.scatter(
    negpts,
    np.random.uniform(0.9, 1.1, len(negpts)),
    color='red')
plt.xlim(-500, 450)
plt.ylim(0.8, 1.2)
plt.axvline(0, color='r')
# Highlight the region (mean +/- 2 std) where the misclassified documents lie
values = np.array(diff_log_proba[iwrong_predictions])
plt.axvspan(
    np.mean(values) - 2 * np.std(values),
    np.mean(values) + 2 * np.std(values),
    facecolor='b', alpha=0.1)
plt.axvline(np.mean(values), linewidth=1);
When optimizing the parameters of a classification pipeline we can write our own scoring function. The optimization routine GridSearchCV, which runs a brute-force parameter search, will select the parameters that achieve the highest score.
When classifying medical documents, we sometimes want to maximize sensitivity (recall) while keeping specificity and accuracy in check. In other words, sensitivity is our target optimization metric. To maximize sensitivity, I'll define a scorer function that returns the recall score.
In [199]:
from sklearn.metrics import recall_score, make_scorer
# Define scorer function that returns the recall score
recall_scorer = make_scorer(recall_score)
Now we build a full pipeline and use the GridSearchCV scikit-learn utility to fit the best parameters for the pipeline. Normally I use longer lists of parameters, but that takes hours to run.
NOTE - we will use GridSearchCV's default 3-fold cross-validation. Other cross-validation schemes can be defined if needed, as sketched below.
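For instance, with the older scikit-learn API used in this notebook (the one that still ships sklearn.grid_search), an explicit stratified split could be passed through the cv argument; a minimal sketch, not used below:

```python
from sklearn.cross_validation import StratifiedKFold

# Illustrative only: a stratified 5-fold split over the training labels
cv_scheme = StratifiedKFold(df_train.CATEGORY.tolist(), n_folds=5)

# It would then be handed to the grid search via the cv argument, e.g.:
# GridSearchCV(pipeline, parameters, scoring=recall_scorer, cv=cv_scheme)
```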
In [201]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.grid_search import GridSearchCV
from sklearn.pipeline import Pipeline
pipeline = Pipeline([
    ('vect', CountVectorizer(tokenizer=tokenize)),
    ('clf', MultinomialNB())
])

# Define some possible parameters for feature extraction and for the classifier
parameters = {
    'vect__max_features': (20000, 30000),
    'vect__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams
    'clf__alpha': (0.05, 0.1, 0.2)
}
# find the best parameters for both the feature extraction and the
# classifier, based on the scorer function we defined
grid_search = GridSearchCV(pipeline, parameters, verbose=1, scoring=recall_scorer)
grid_search.fit(df_train['NOTE_TEXT'].tolist(), df_train.CATEGORY.tolist())
Out[201]:
In [203]:
print("Best parameters set found on development set:")
print
print(grid_search.best_params_)
In [204]:
# Use the optimal classifier to make predictions on the test set
opt_predictions = grid_search.predict(df_test['NOTE_TEXT'].tolist())
In [205]:
render_confusion_matrix(df_test.CATEGORY.tolist(), opt_predictions)
Out[205]:
In [206]:
print
print 'Accuracy: ', accuracy_score(df_test.CATEGORY.tolist(), opt_predictions)
print
print classification_report(df_test.CATEGORY.tolist(), opt_predictions)
Comparing the performance of the optimized classifier to that of the classifier with the ad-hoc parameters shows that, while the accuracy of the optimized classifier is slightly lower (91% compared to 92%), the sensitivity for True documents, which is our target performance metric, increased from 78% to 82%.
In [ ]: