In [2]:
import pandas as pd
import nltk
import codecs
import unicodedata
import re
from copy import deepcopy
from pyUtil import easyPickle as pickle
from pyUtil import flattenList as flatten

Data Input


In [7]:
criteria_text = codecs.open('data/ct_criteria_colin.txt',
                            encoding="utf-8")
criteria_text = criteria_text.readlines()

Chunck sentences and tokens


In [8]:
#break sentences on '-'
criteria_text_sent = [re.split(' - ', line) for line in criteria_text]

#get sentence tokenizer
sent_tokenizer=nltk.data.load('tokenizers/punkt/english.pickle')

#run the sentence tokenizer over all the documents
def sent_token(text):
    sentence_groups = []
    for sent_group in text:
        group_holder = []
        for sent in sent_group:
            group_holder.append(sent_tokenizer.tokenize(sent))
        sentence_groups.append(group_holder)
        del group_holder
    return sentence_groups

criteria_text_sent = sent_token(criteria_text_sent)


#Flatten the documents to contain just a list of strings where each string is a sentence
def flatten_docs(text):
    result = []
    for doc in text:
        result.append(flatten.flatten(doc))
    return result

criteria_text_docs = flatten_docs(criteria_text_sent)

#create a list of all sentences
criteria_text_sents = flatten.flatten(criteria_text_docs)

#CREATING TOKENS

#patter for tokenizing
pattern = r'''(?x)    # set flag to allow verbose regexps
        ([A-Z]\.)+        # abbreviations, e.g. U.S.A
        | \w+([-‘]\w+)*        # words with optional internal hyphens
        | \$?\d+(\.\d+)?%?  # currency and percentages, e.g. $12.40, 82%
        | \.\.\.            # ellipsis...   
        | [][.,;"'?():\-_`]+  # these are separate tokens
        '''
#create tokens for the sentence list
criteria_text_sent_tokens = [nltk.regexp_tokenize(sent, pattern) for sent
                         in criteria_text_sents]

#use this for creating tokens for the documents
def doc_token(text):
    result = []
    for doc in text:
        doc_text = []
        for sent in doc:
            doc_text.append(nltk.regexp_tokenize(sent, pattern))
        result.append(doc_text)
    return result
#criteria_text_docs_token = doc_token(criteria_text_docs)

Tag tokens


In [9]:
#tag document structured criteria text
def doc_tagger_pos(text):
    result = []
    for doc in text:
        doc_text = []
        for sent in doc:
            doc_text.append(nltk.pos_tag(sent))
        result.append(doc_text)
    return result

#criteria_text_docs_tagged_pos = doc_tagger_pos(criteria_text_docs_token)

#tag sentence structured criteria text
criteria_text_sent_tag = []
for sent in criteria_text_sent_tokens:
    criteria_text_sent_tag.append(nltk.pos_tag(sent))

Save and load tagged corpus


In [10]:
#save tagged corpus
pickle.save_object(criteria_text_sent_tag,
                   'data/criteria_corpus_pos_tagged.pkl')

In [4]:
#load tagged corpus
criteria_text_sent_tag = pickle.open_object('data/criteria_corpus_pos_tagged.pkl')

Keyphrases for criteria


In [5]:
#imports
from nltk.util import ngrams
from nltk import FreqDist
import string
from nltk.corpus import stopwords

In [13]:
#remove stopwords and punctuation
def remove_punct(text):
    return [[word for word in sent if word[0] not in string.punctuation] for sent in text]
def remove_stop(text):
    return [[word for word in sent if word.lower() not in stopwords.words('english')] for sent in text]

#create non-tagged corpus
criteria_text_sent_tokens = [[w[0] for w in sent] for sent in criteria_text_sent_tag]
criteria_sents_no_stop = remove_punct(criteria_text_sent_tokens)
criteria_sents_no_stop = remove_stop(criteria_sents_no_stop)

Chunker Approach


In [33]:
def get_specific_sent(text, spec_words):

    specific_sents = []
    spec_words = map(lambda x: x.lower(), spec_words)
    for sent in text:
        for word in sent:
            if word[0].lower() in spec_words:
                specific_sents.append(sent)
                break
    return specific_sents

In [37]:
#create subsection of sentences to run the chunker on that contain cerain phrases
smoker_list = ['Non-smoker', 'smoker']
smoker_sents = get_specific_sent(criteria_text_sent_tag, smoker_list)
pregnancy_list = ['Pregnancy', 'pregnant']
pregnancy_sents = get_specific_sent(criteria_text_sent_tag, pregnancy_list)
birth_control_list = ['Birth control', 'contraception']
birth_control_sents = get_specific_sent(criteria_text_sent_tag, birth_control_list)
drug_list = ['Illicit drugs', 'Alcohol abuse', 'illegal', 'illicit']
drug_sents = get_specific_sent(criteria_text_sent_tag, drug_list)
heart_failure_list = ['Congestive Heart Failure', 'heart failure']
heart_failure_sents = get_specific_sent(criteria_text_sent_tag, heart_failure_list)
hiv_list = ['HIV', 'aids', 'human immunodeficiency virus']
hiv_sents = get_specific_sent(criteria_text_sent_tag, hiv_list)
allergy_list = ['Allergies', 'allergy', 'hypersensitivity']
allergy_sents = get_specific_sent(criteria_text_sent_tag, allergy_list)

In [40]:
term_sents_list = [smoker_sents, pregnancy_sents, birth_control_sents, drug_sents,
                   heart_failure_sents, hiv_sents, allergy_sents]
term_list = [smoker_list, pregnancy_list, birth_control_list, drug_list, heart_failure_list,
             hiv_list, allergy_list]

In [30]:
#get chunks
def chunker(tagged_corpus, chunk_reg):
    
    cp = nltk.RegexpParser(chunk_reg)
    
    results = []
    
    for sents in tagged_corpus:
        tree = cp.parse(sents)
        for subtree in tree.subtrees():
            if subtree.label() == 'CHUNK':
                results.append(subtree[:])
    return results

chunk_reg = r"""
                  CHUNK: {(<NN.*><POS>)?<RB>?<JJ.*>*<NN.*>+}
             """


def get_doc_desc(num, terms, text):
    print
    print 'For terms: ' + ', '.join(terms)
    for sent in [[word[0] for word in sent] for sent in text[:num]]:
        print ' '.join(sent)

In [41]:
for idx, term in enumerate(term_sents_list):
    chunks_dict_criteria = chunker(term, chunk_reg)
    get_doc_desc(20, term_list[idx], chunks_dict_criteria)


For terms: Non-smoker, smoker
Non-smoker
Current smoker
nicotine patches
gum
Current cigarette smoker
cigarettes day
smoking
year
Current smoker
subject
smoker
use
tobacco
nicotine
products
months
Screening
LDL
risk factor
LDL

For terms: Pregnancy, pregnant
PRIOR CONCURRENT THERAPY
Biologic therapy
Pregnancy
use
double barrier method
pregnancy
e
condom
diaphragm
cervical cap
positive pregnancy test
breast feeding
screening
Women
negative pregnancy test
Baseline Month
Female subjects
negative pregnancy
test
child

For terms: Birth control, contraception
Must
method
contraception
study
Female subjects
use
effective nonhormonal birth control methods
practicing
birth control methods
days
end
treatment period
Note
Estrogen-based hormonal contraception
Prezista
trial
women
Fertile patients
effective contraception PRIOR CONCURRENT THERAPY
Biologic therapy

For terms: Illicit drugs, Alcohol abuse, illegal, illicit
Known history
alcohol abuse
illicit drugs
steroids
alcoholic beverages day
Alcohol abuse
drug addiction
use
illegal drugs
history
drug abuse
illicit drug use
history
alcohol abuse
daily consumption
alcoholic drinks
day
years
alcohol
drugs

For terms: Congestive Heart Failure, heart failure

For terms: HIV, aids, human immunodeficiency virus
HIV
negative test
Subjects
laboratory abnormalities
Division
AIDS Table
Grading
Severity
Adult
Pediatric Adverse Events (" DAIDS grading table
accordance
normal ranges
trial
clinical laboratory
Subjects
HIV
signs
active Hepatitis B
Hepatitis C.
multiple sclerosis

For terms: Allergies, allergy, hypersensitivity
Known
hypersensitivity
vaccine components
vaccine
same substances
eruptions
drug allergies
food allergy
eczema
psoriasis
urticaria
opinion
investigator
contraindication
study enrollment
clinically significant allergy
hypersensitivity
excipients
medications
trial

Ngram Approach


In [43]:
#look at multigrams as well for the specific sentences
def multinNgram(n, text):
    '''This funciton loops through ngrams of length 1 to n.'''
    text = remove_punct(text)
    text = remove_stop(text)
    result = {}
    flat_list = flatten.flatten(text)
    for num in range(n, 0, -1):
        result[num] = []
        ngram = ngrams(flat_list, num)
        result[num] = [' '.join(gram) for gram in ngram]
    return result

def get_top_mulitgrams(multiGrams, terms, num):
    print 'For terms: ' + ', '.join(terms)
    for ngram in multiGrams:
        fd = FreqDist(multiGrams[ngram]).most_common(num)
        for key in fd:
            print key

In [45]:
for idx, term in enumerate(term_sents_list):
    multiGrams = multinNgram(4, [[word[0] for word in sent] for sent in term])
    get_top_mulitgrams(multiGrams, term_list[idx], 10)


For terms: Non-smoker, smoker
(u'smoker', 35)
(u'Current', 12)
(u'history', 11)
(u'years', 9)
(u'day', 8)
(u'pack', 8)
(u'smoking', 8)
(u'cigarettes', 8)
(u'10', 7)
(u'Non-smoker', 6)
(u'Current smoker', 9)
(u'pack years', 5)
(u'Exclusion Criteria', 4)
(u'per day', 4)
(u'1 year', 4)
(u'cigarettes day', 4)
(u'5 cigarettes', 3)
(u'smoker defined', 3)
(u'defined smoked', 3)
(u'6 months', 3)
(u'smoker defined smoked', 3)
(u'smoker Current smoker', 3)
(u'preceding 1 year', 2)
(u'6 months pack', 2)
(u'Current cigarette smoker', 2)
(u'year Current smoker', 2)
(u'5 cigarettes day', 2)
(u'defined smoked preceding', 2)
(u'non-smoker 18 years', 2)
(u'10 pack years', 2)
(u'defined smoked preceding 1', 2)
(u'smoked preceding 1 year', 2)
(u'Current smoker defined smoked', 2)
(u'smoker defined smoked preceding', 2)
(u'18 years age older', 2)
(u'non-smoker 18 years age', 2)
(u'1 year Current smoker', 2)
(u'within last 2 years', 1)
(u'containing products within 6', 1)
(u'within past year prior', 1)
For terms: Pregnancy, pregnant
(u'pregnant', 679)
(u'pregnancy', 673)
(u'test', 451)
(u'study', 341)
(u'potential', 336)
(u'Pregnant', 314)
(u'women', 314)
(u'must', 267)
(u'negative', 235)
(u'lactating', 217)
(u'pregnancy test', 430)
(u'childbearing potential', 192)
(u'pregnant nursing', 126)
(u'must negative', 113)
(u'become pregnant', 110)
(u'urine pregnancy', 100)
(u'pregnant lactating', 100)
(u'breast feeding', 99)
(u'negative pregnancy', 99)
(u'potential must', 97)
(u'urine pregnancy test', 95)
(u'negative pregnancy test', 91)
(u'Negative pregnancy test', 84)
(u'potential must negative', 79)
(u'childbearing potential must', 69)
(u'serum pregnancy test', 63)
(u'nursing Negative pregnancy', 59)
(u'pregnant nursing Negative', 59)
(u'Women childbearing potential', 50)
(u'must negative pregnancy', 49)
(u'nursing Negative pregnancy test', 59)
(u'pregnant nursing Negative pregnancy', 59)
(u'childbearing potential must negative', 58)
(u'must negative pregnancy test', 48)
(u'negative urine pregnancy test', 46)
(u'potential must negative pregnancy', 39)
(u'Women childbearing potential must', 33)
(u'negative serum pregnancy test', 31)
(u'potential must negative serum', 21)
(u'must negative serum pregnancy', 21)
For terms: Birth control, contraception
(u'contraception', 511)
(u'use', 296)
(u'must', 272)
(u'study', 259)
(u'potential', 203)
(u'effective', 201)
(u'patients', 135)
(u'childbearing', 129)
(u'method', 128)
(u'least', 117)
(u'effective contraception', 138)
(u'use effective', 129)
(u'childbearing potential', 118)
(u'must use', 107)
(u'patients must', 93)
(u'method contraception', 83)
(u'Fertile patients', 82)
(u'adequate contraception', 72)
(u'must agree', 63)
(u'agree use', 59)
(u'use effective contraception', 101)
(u'must use effective', 89)
(u'patients must use', 84)
(u'Fertile patients must', 81)
(u'must agree use', 51)
(u'use adequate contraception', 37)
(u'childbearing potential must', 36)
(u'women childbearing potential', 30)
(u'PRIOR CONCURRENT THERAPY', 28)
(u'Women childbearing potential', 27)
(u'patients must use effective', 81)
(u'Fertile patients must use', 81)
(u'must use effective contraception', 80)
(u'PRIOR CONCURRENT THERAPY Biologic', 25)
(u'must agree use adequate', 24)
(u'CONCURRENT THERAPY Biologic therapy', 24)
(u'agree use adequate contraception', 23)
(u'contraception Fertile patients must', 18)
(u'contraception PRIOR CONCURRENT THERAPY', 15)
(u'use effective contraception PRIOR', 15)
For terms: Illicit drugs, Alcohol abuse, illegal, illicit
(u'illicit', 39)
(u'drug', 29)
(u'drugs', 28)
(u'alcohol', 27)
(u'use', 26)
(u'abuse', 20)
(u'within', 16)
(u'study', 12)
(u'months', 11)
(u'history', 10)
(u'illicit drug', 19)
(u'illicit drugs', 17)
(u'alcohol illicit', 11)
(u'drug use', 11)
(u'drug abuse', 9)
(u'prior first', 6)
(u'illegal drugs', 6)
(u'first dose', 5)
(u'days prior', 5)
(u'history alcohol', 5)
(u'illicit drug use', 10)
(u'alcohol illicit drug', 8)
(u'illicit drug abuse', 6)
(u'first dose study', 5)
(u'days prior first', 5)
(u'dose study medication', 5)
(u'use illicit drugs', 5)
(u'prior first dose', 5)
(u'illicit drugs alcohol', 4)
(u'drug abuse within', 3)
(u'prior first dose study', 5)
(u'first dose study medication', 5)
(u'days prior first dose', 4)
(u'alcohol illicit drug abuse', 4)
(u'illicit drug abuse within', 3)
(u'Current use illicit drugs', 3)
(u'alcohol illicit drug use', 3)
(u'illicit drug use within', 2)
(u'abuse drug addiction use', 2)
(u'90 days prior first', 2)
For terms: Congestive Heart Failure, heart failure
For terms: HIV, aids, human immunodeficiency virus
(u'HIV', 517)
(u'hepatitis', 163)
(u'infection', 146)
(u'B', 127)
(u'positive', 126)
(u'C', 121)
(u'virus', 118)
(u'immunodeficiency', 97)
(u'human', 84)
(u'Hepatitis', 73)
(u'HIV infection', 84)
(u'immunodeficiency virus', 81)
(u'virus HIV', 80)
(u'human immunodeficiency', 79)
(u'hepatitis B', 79)
(u'hepatitis C', 60)
(u'HIV positive', 56)
(u'Hepatitis B', 41)
(u'B C', 33)
(u'HIV hepatitis', 28)
(u'immunodeficiency virus HIV', 78)
(u'human immunodeficiency virus', 78)
(u'B hepatitis C', 25)
(u'hepatitis B hepatitis', 23)
(u'B surface antigen', 22)
(u'HIV hepatitis B', 22)
(u'hepatitis B C', 21)
(u'virus HIV infection', 17)
(u'HIV Hepatitis B', 16)
(u'Hepatitis B surface', 13)
(u'human immunodeficiency virus HIV', 75)
(u'hepatitis B hepatitis C', 23)
(u'immunodeficiency virus HIV infection', 17)
(u'immunodeficiency virus HIV positive', 13)
(u'hepatitis B surface antigen', 11)
(u'Hepatitis B surface antigen', 11)
(u'immunodeficiency virus HIV hepatitis', 9)
(u'positive human immunodeficiency virus', 9)
(u'B hepatitis C HIV', 9)
(u'Known human immunodeficiency virus', 8)
For terms: Allergies, allergy, hypersensitivity
(u'hypersensitivity', 331)
(u'allergy', 272)
(u'known', 157)
(u'Known', 150)
(u'history', 139)
(u'study', 112)
(u'History', 94)
(u'drug', 88)
(u'allergies', 76)
(u'drugs', 70)
(u'known hypersensitivity', 72)
(u'Known hypersensitivity', 65)
(u'Known allergy', 43)
(u'History hypersensitivity', 35)
(u'Patients known', 29)
(u'known allergy', 28)
(u'history allergy', 27)
(u'allergy hypersensitivity', 23)
(u'hypersensitivity reaction', 21)
(u'study drug', 20)
(u'Patients known hypersensitivity', 18)
(u'PRIOR CONCURRENT THERAPY', 14)
(u'CONCURRENT THERAPY Biologic', 11)
(u'THERAPY Biologic therapy', 11)
(u'history drug allergy', 10)
(u'Known suspected allergy', 8)
(u'history allergy hypersensitivity', 7)
(u'Known suspected hypersensitivity', 7)
(u'Patient known hypersensitivity', 7)
(u'known suspected allergy', 7)
(u'PRIOR CONCURRENT THERAPY Biologic', 11)
(u'CONCURRENT THERAPY Biologic therapy', 11)
(u'CHARACTERISTICS Age 18 Performance', 5)
(u'PATIENT CHARACTERISTICS Age 18', 5)
(u'drugs formulated polysorbate 80', 5)
(u'Age 18 Performance status', 5)
(u'times upper limit normal', 4)
(u'sensitivity study medications components', 4)
(u'least 100 000 mm3', 4)
(u'study medications components thereof', 4)

In [75]:
multiGrams = multinNgram(4, [[word[0] for word in sent] for sent in fertile_sents])
get_top_mulitgrams(multiGrams, fertile_terms, 10)


For terms: fertile
(u'contraception', 101)
(u'use', 100)
(u'patients', 95)
(u'effective', 92)
(u'must', 91)
(u'Fertile', 88)
(u'least', 63)
(u'therapy', 55)
(u'study', 53)
(u'prior', 47)
(u'use effective', 88)
(u'patients must', 84)
(u'must use', 84)
(u'Fertile patients', 82)
(u'effective contraception', 76)
(u'PRIOR CONCURRENT', 29)
(u'CONCURRENT THERAPY', 29)
(u'contraception Fertile', 29)
(u'Biologic therapy', 25)
(u'THERAPY Biologic', 25)
(u'patients must use', 83)
(u'must use effective', 82)
(u'Fertile patients must', 81)
(u'use effective contraception', 75)
(u'PRIOR CONCURRENT THERAPY', 29)
(u'contraception Fertile patients', 28)
(u'effective contraception Fertile', 26)
(u'CONCURRENT THERAPY Biologic', 25)
(u'THERAPY Biologic therapy', 24)
(u'contraception PRIOR CONCURRENT', 15)
(u'patients must use effective', 82)
(u'Fertile patients must use', 81)
(u'must use effective contraception', 74)
(u'contraception Fertile patients must', 28)
(u'use effective contraception Fertile', 26)
(u'effective contraception Fertile patients', 25)
(u'PRIOR CONCURRENT THERAPY Biologic', 25)
(u'CONCURRENT THERAPY Biologic therapy', 24)
(u'contraception PRIOR CONCURRENT THERAPY', 15)
(u'use effective contraception PRIOR', 15)

Look at full sentences


In [77]:
def check_sents(text):
    for sent in text:
        print ' '.join([word[0] for word in sent])

In [79]:
check_sents(fertile_sents[:10])


Fertile patients must use effective contraception PRIOR CONCURRENT THERAPY : Biologic therapy
Fertile patients must use effective contraception PRIOR CONCURRENT THERAPY : Biologic therapy :
Fertile patients must use effective contraception during and for 4 weeks after study participation
Fertile patients must use effective contraception PRIOR CONCURRENT THERAPY : Biologic therapy :
Agreement to use a condom , and with a fertile female partner , another form of contraception .
DISEASE CHARACTERISTICS : Histologically proven epithelial adenocarcinoma of the ovary , fallopian tube , or peritoneum CA 125 greater than 35 U mL No conclusive radiological or clinical evidence of disease No disease recurrence Must have received only 1 prior platinum based chemotherapy regimen No tumors of low malignant potential or noninvasive disease PATIENT CHARACTERISTICS : Age : 18 and over Performance status : ECOG 0-2 Life expectancy : At least 6 months Hematopoietic : Hemoglobin at least 8 . 0 g dL Lymphocyte count at least 1 , 000 mm3 Neutrophil count at least 1 , 500 mm3 Platelet count at least 100 , 000 mm3 Hepatic : Bilirubin no greater than 1 . 5 times normal Renal : Creatinine no greater than 2 mg dL Cardiovascular : No uncontrolled hypertension No congestive heart failure No arrhythmias Other : Not pregnant or nursing Negative pregnancy test Fertile patients must use effective contraception No active autoimmune disease requiring chronic treatment No allergy to murine proteins No documented anaphylactic reaction to any drug No active infection causing fever No immunodeficiency disease No uncontrolled nonmalignant diseases No other malignancy ( except nonmelanomatous skin cancer or carcinoma in situ of the cervix ) unless curatively treated and free of disease for at least 5 years PRIOR CONCURRENT THERAPY : Biologic therapy : No prior murine monoclonal antibodies Chemotherapy : See Disease Characteristics At least 4 weeks since prior platinum based chemotherapy No concurrent chemotherapy Endocrine therapy : Not specified Radiotherapy : At least 6 months since prior limited field ( i . e ., abdominal or pelvic ) radiotherapy No prior whole abdominal radiotherapy Surgery : At least 4 weeks since prior surgery No prior splenectomy Other : At least 4 weeks since prior immunosuppressive drugs No concurrent immunosuppressive drugs At least 30 days since other prior investigational drugs
Patients who are fertile must agree to use an effective method of contraception during participation in the study
Fertile patients must use effective contraception PRIOR CONCURRENT THERAPY : Biologic therapy :
Fertile patients must use effective contraception
Fertile patients must use effective barrier contraception during and for 3 months after study

chosen categories

  • Non-smoker
  • Pregnancy
  • Birth control
  • Illicit drugs/Alcohol abuse
  • Congestive Heart Failure
  • HIV
  • Allergies/hypersensitivity

In [ ]: