In [ ]:
import sqlite3
import os
import sys
print(sqlite3.version)

In [ ]:
import platform

print(platform.python_version())
print(platform.python_version_tuple())

In [ ]:
sqlitedb = os.path.join(os.path.expanduser('~'),'Box Sync', 'GradSchoolStuff', 'MastersProject', 'ctpa.sqlite')
if not (os.path.exists(sqlitedb)):
    print("Specified database does not exist")
    sys.exit()

In [ ]:
# try and catch - everything closed at the end
try:
    connection = sqlite3.connect(sqlitedb)
    cursor = connection.cursor()
    cursor.execute('select sqlite_version()')
    data = cursor.fetchone()
    print(data)
except:
    print('Error: ', sys.exc_info()[0])
    raise
finally:
    if connection:
        connection.close()

In [ ]:
# with is supposed to close automatically if there is an error
connection = sqlite3.connect(sqlitedb)
with connection:
    cursor = connection.cursor()
    cursor.execute('select sqlite_version()')
    data = cursor.fetchone()
    print(data)

In [ ]:
with connection:
    cursor = connection.cursor()
    cursor.execute('select * from sqlite_master')
    row = cursor.fetchone()
    while row:
        print(row)
        row = cursor.fetchone()

In [ ]:
# PRAGMA allows you to get metadata about a table
with connection:
    cur = connection.cursor()    
    cur.execute('PRAGMA table_info(sqlite_master)')
    data = cur.fetchall()
    for d in data:
        print(d)

In [ ]:
with connection:
    cursor = connection.cursor()
    cursor.execute('select * from sqlite_master')
    rows = cursor.fetchall()
    for row in rows:
        print('-----------------')
        print('type: ', row[0])
        print('name: ', row[1])
        print('tbl_name: ', row[2])
        print('rootpage: ', row[3])
        print('sql: ', row[4])
        #for col in row:
        #    print(col, end=" ")
        #print('')

In [ ]:
with connection:
    cur = connection.cursor()
    cur.execute('select count(*) from reports')
    data = cur.fetchall()
    print('Total rows in REPORTS table: ', data[0][0])

In [78]:
with connection:
    cur = connection.cursor()
    cur.execute('select * from reports limit 5')
    col_names = [cn[0] for cn in cur.description]
    rows = cur.fetchall()
    #print(len(rows[0]))
    #print("%s %s %s %s %s %s" % (col_names[0], col_names[1], col_names[2], col_names[3], col_names[4], col_names[5]))

    for row in rows:
        for i,col in enumerate(row):
            print('++++++', col_names[i], ': ', col)
        print('----------------- END ROW -----------------')


++++++ id :  0
++++++ hbid :  iDio0nDdKR8z
++++++ reportid :  1,iDio0nDdKR8z
++++++ reportType :  RAD
++++++ report :  

CT CHEST WITH CONTRAST:  10/26/06  1820 HOURS 
HISTORY:  SHORTNESS OF BREATH.  RULE OUT PE.   
TECHNIQUE:  Axial images of the chest were obtained following 
intravenous administration of 125cc of Optiray 320 utilizing 
pulmonary embolism protocol. 
FINDINGS:  The study is limited due to suboptimal opacification of 
the pulmonary arteries, especially for the evaluation of distal 
segmental pulmonary arteries.  There is no gross evidence of 
significant pulmonary embolism.  Small emboli in the distal segmental 
pulmonary arteries cannot be totally excluded. 
Cardiomegaly is present along with pulmonary vascular congestion.  
Prominent pulmonary artery consistent with pulmonary arterial 
hypertension is also seen.  A small right pleural effusion with 
adjacent atelectasis is present.  There is no evidence of a 
pneumothorax.  The aorta is of normal caliber.   
In the visualized abdomen, there is evidence of ascites, streaking of 
subcutaneous fat is seen consistent with anasarca. 
IMPRESSION: 
1. LIMITED STUDY DEMONSTRATING NO GROSS EVIDENCE OF SIGNIFICANT 
PULMONARY EMBOLISM. 
2. CARDIOMEGALY, PULMONARY VASCULAR CONGESTION AND PULMONARY ARTERIAL 
HYPERTENSION PRESENT. 
3. SMALL RIGHT EFFUSION WITH ADJACENT ATELECTASIS.   
O27 
END OF IMPRESSION: 
  
   
  
  




++++++ impression :  IMPRESSION: 
1. LIMITED STUDY DEMONSTRATING NO GROSS EVIDENCE OF SIGNIFICANT 
PULMONARY EMBOLISM. 
2. CARDIOMEGALY, PULMONARY VASCULAR CONGESTION AND PULMONARY ARTERIAL 
HYPERTENSION PRESENT. 
3. SMALL RIGHT EFFUSION WITH ADJACENT ATELECTASIS.   
O27 
END OF IMPRESSION: 
----------------- END ROW -----------------
++++++ id :  1
++++++ hbid :  flsC6rf5kjGz
++++++ reportid :  2,flsC6rf5kjGz
++++++ reportType :  RAD
++++++ report :  


CTA CHEST:  10/26/2006 8:00 AM 
HISTORY:  HISTORY OF ENDOMETRIAL CANCER.  SHORTNESS OF BREATH. 
TECHNIQUE:  Helical images in 1.25mm collimation were obtained from 
the lung bases to the apices after the uneventful administration of 
150cc of intravenous Optiray-320. 
FINDINGS:  There are no filling defects to suggest pulmonary 
embolism.  There are dependent bibasilar opacities compatible with 
atelectasis.  There is some scarring at the right apex.  The lungs 
are otherwise clear. 
IMPRESSION: 
1.  NO FILLING DEFECTS TO SUGGEST PULMONARY EMBOLISM. 
2.  DEPENDENT CONSOLIDATION COMPATIBLE WITH ATELECTASIS. 
O26 
END OF IMPRESSION: 




++++++ impression :  IMPRESSION: 
1.  NO FILLING DEFECTS TO SUGGEST PULMONARY EMBOLISM. 
2.  DEPENDENT CONSOLIDATION COMPATIBLE WITH ATELECTASIS. 
O26 
END OF IMPRESSION: 
----------------- END ROW -----------------
++++++ id :  2
++++++ hbid :  SDEkX9mlfuwT
++++++ reportid :  3,SDEkX9mlfuwT
++++++ reportType :  RAD
++++++ report :  


EXAMINATION PERFORMED:
CT THORAX  WITH CONTRAST   05/10/08     0940 HOURS

CLINICAL HISTORY:   
Chest pain.

COMPARISON:   
4/9/07.

TECHNIQUE:   
CTA of the chest with nonionic intravenous contrast as per the PE
protocol.

FINDINGS:   
There is no evidence of a pulmonary embolism.

The lungs demonstrate no evidence of focal consolidation. There is
a small ground-glass subpleural nodule, image 110 in the right
upper lobe.This measures 6mm. Central airway is patent.

There are small mediastinal nodes which are unchanged. There is no
hilar adenopathy. There are no pleural effusions. There is no
pericardial effusion.

Exam of the upper abdomen demonstrates no definite abnormality.

There is atelectasis at the left base and lingula.

Visualized bones demonstrate no focal lesions.

IMPRESSION:   
1. NO EVIDENCE OF A PULMONARY EMBOLISM.
2. GROUND-GLASS SUBCENTIMETER NODULE IN THE RIGHT UPPER LOBE. THIS
IS NONSPECIFIC AND 3 MONTH INTERVAL FOLLOW-UP IS SUGGESTED.

END OF IMPRESSION:




++++++ impression :  IMPRESSION:   
1. NO EVIDENCE OF A PULMONARY EMBOLISM.
2. GROUND-GLASS SUBCENTIMETER NODULE IN THE RIGHT UPPER LOBE. THIS
IS NONSPECIFIC AND 3 MONTH INTERVAL FOLLOW-UP IS SUGGESTED.

END OF IMPRESSION:
----------------- END ROW -----------------
++++++ id :  3
++++++ hbid :  m2ia9ilU38Ip
++++++ reportid :  4,m2ia9ilU38Ip
++++++ reportType :  RAD
++++++ report :  


EXAMINATION PERFORMED:
CT ANGIOGRAPHY CHEST WITH CONTRAST   12/03/07     1614 HOURS

CLINICAL HISTORY:   
Unresponsive.

TECHNIQUE:   
CT of the chest with nonionic intravenous contrast as per the PE
protocol.

FINDINGS:   
There is no evidence of a pulmonary embolism.

The lungs demonstrate small bilateral pleural effusions with
atelectasis. There is a focal irregular nodule in the right upper
lobe anteriorly, image 153. This measures 7x6mm. There is no
definite evidence of calcification. There is also a subpleural
nodular density image 100 in the left upper lobe, however, there
is streak artifact from the pacer, which limits evaluation.
Central airway is patent.

There is no evidence of thoracic adenopathy. There is no
pericardial effusion. There is cardiomegaly. There is coronary
artery calcification.

Exam of the upper abdomen demonstrates slight hypertrophy of the
lateral segment which is partially imaged. There is slight
infiltration of the visualized mesentery.

The visualized bones demonstrate no focal lesions. There are
degenerative changes of the spine.

Main pulmonary artery is slightly dilated.

IMPRESSION:   
1. NO EVIDENCE OF A PULMONARY EMBOLISM.
2. CARDIOMEGALY WITH SMALL BILATERAL PLEURAL EFFUSIONS AND SLIGHT
SEPTAL THICKENING WHICH MAY INDICATE PULMONARY EDEMA.
3. FOCAL NODULE IN THE RIGHT UPPER LOBE AND SUBPLEURAL NODULE IN
THE LEFT UPPER LOBE WHICH ARE NONSPECIFIC AND AMENABLE TO INTERVAL
FOLLOW UP.
4. SLIGHT ENLARGEMENT OF THE MAIN PULMONARY ARTERY WHICH MAY
INDICATE PULMONARY ARTERIAL HYPERTENSION.
5. SLIGHT HYPERTROPHY OF THE LATERAL SEGMENT OF THE LIVER, WHICH
MAY INDICATE UNDERLYING CIRRHOSIS. DEDICATED LIVER IMAGING COULD
BE OBTAINED AFTER THE PATIENT RESOLVES THIS ACUTE PROCESS.

END OF IMPRESSION:




++++++ impression :  IMPRESSION:   
1. NO EVIDENCE OF A PULMONARY EMBOLISM.
2. CARDIOMEGALY WITH SMALL BILATERAL PLEURAL EFFUSIONS AND SLIGHT
SEPTAL THICKENING WHICH MAY INDICATE PULMONARY EDEMA.
3. FOCAL NODULE IN THE RIGHT UPPER LOBE AND SUBPLEURAL NODULE IN
THE LEFT UPPER LOBE WHICH ARE NONSPECIFIC AND AMENABLE TO INTERVAL
FOLLOW UP.
4. SLIGHT ENLARGEMENT OF THE MAIN PULMONARY ARTERY WHICH MAY
INDICATE PULMONARY ARTERIAL HYPERTENSION.
5. SLIGHT HYPERTROPHY OF THE LATERAL SEGMENT OF THE LIVER, WHICH
MAY INDICATE UNDERLYING CIRRHOSIS. DEDICATED LIVER IMAGING COULD
BE OBTAINED AFTER THE PATIENT RESOLVES THIS ACUTE PROCESS.

END OF IMPRESSION:
----------------- END ROW -----------------
++++++ id :  4
++++++ hbid :  XTw2PqVh7BWC
++++++ reportid :  5,XTw2PqVh7BWC
++++++ reportType :  RAD
++++++ report :  


CT CHEST PE PROTOCOL:  12/27/06  1:23 AM
HISTORY:   DYSPNEA, PE PROTOCOL. 
COMPARISON:  No prior comparison. 
TECHNIQUE:  Post IV contrast 0.6mm helical images were obtained from 
lung base through thoracic inlet with coronal reformation. 
FINDINGS:  
Opacification of the pulmonary arteries reveal no pulmonary embolism. 
Cardiomegaly with minimal bilateral pleural effusions, right greater 
than left are present. 
The anatomy of the chest is distorted by the patient's marked 
thoracic kyphosis.  
A compression fracture is observed at the T3 level with additional 
wedge compression fractures present throughout the remainder of the 
thoracolumbar spine as seen on the lateral chest radiograph from 
yesterday.  Additional sagittal reconstructed images of the thoracic 
spine  performed better demonstrate these compression deformities. 
The patient's severe thoracic kyphosis likely contributes to the 
patient's dyspnea.   T
here is associated aortic ectasia although no significant aneurysm or 
dissection is observed. 
Associated dependent subsegmental atelectasis is present.  No 
endotracheal lesions. 
IMPRESSION:
1.  NO PULMONARY EMBOLISM. 
2.  MARKED THORACIC KYPHOSIS WITH MULTIPLE ANTERIOR WEDGE COMPRESSION 
DEFORMITIES, AGE INDETERMINATE WITH ASSOCIATED CARDIOMEGALY, AORTIC 
ECTASIA AND DEPENDENT SUBSEGMENTAL ATELECTASIS. 
D27
END OF IMPRESSION:




++++++ impression :  IMPRESSION:
1.  NO PULMONARY EMBOLISM. 
2.  MARKED THORACIC KYPHOSIS WITH MULTIPLE ANTERIOR WEDGE COMPRESSION 
DEFORMITIES, AGE INDETERMINATE WITH ASSOCIATED CARDIOMEGALY, AORTIC 
ECTASIA AND DEPENDENT SUBSEGMENTAL ATELECTASIS. 
D27
END OF IMPRESSION:
----------------- END ROW -----------------

In [ ]:
report_col = 4
print(rows[0][report_col])

In [86]:
from nltk.util import ngrams
from nltk.probability import FreqDist
import re

sentences = ''
for row in rows:
    sentences += row[report_col]
    
ts = nltk.sent_tokenize(sentences)

#words that should be ignored as they are part of the report structure:
myregexp = '(EXAMINATION PERFORMED:|HISTORY:|CLINICAL HISTORY:|COMPARISON:|TECHNIQUE:|FINDINGS:|IMPRESSION:|END OF IMPRESSION:)'
new_ts = []
for s in ts:
    # covert to lower case to find all matches regardless of case
    s = re.sub(myregexp, '', s)
    s = s.lower()
    new_ts.append(s)

for s in new_ts:
    print('Sentence: ', s)


Sentence:  

ct chest with contrast:  10/26/06  1820 hours 
  shortness of breath.
Sentence:  rule out pe.
Sentence:    axial images of the chest were obtained following 
intravenous administration of 125cc of optiray 320 utilizing 
pulmonary embolism protocol.
Sentence:    the study is limited due to suboptimal opacification of 
the pulmonary arteries, especially for the evaluation of distal 
segmental pulmonary arteries.
Sentence:  there is no gross evidence of 
significant pulmonary embolism.
Sentence:  small emboli in the distal segmental 
pulmonary arteries cannot be totally excluded.
Sentence:  cardiomegaly is present along with pulmonary vascular congestion.
Sentence:  prominent pulmonary artery consistent with pulmonary arterial 
hypertension is also seen.
Sentence:  a small right pleural effusion with 
adjacent atelectasis is present.
Sentence:  there is no evidence of a 
pneumothorax.
Sentence:  the aorta is of normal caliber.
Sentence:  in the visualized abdomen, there is evidence of ascites, streaking of 
subcutaneous fat is seen consistent with anasarca.
Sentence:   
1.
Sentence:  limited study demonstrating no gross evidence of significant 
pulmonary embolism.
Sentence:  2.
Sentence:  cardiomegaly, pulmonary vascular congestion and pulmonary arterial 
hypertension present.
Sentence:  3.
Sentence:  small right effusion with adjacent atelectasis.
Sentence:  o27 
 
  
   
  
  






cta chest:  10/26/2006 8:00 am 
  history of endometrial cancer.
Sentence:  shortness of breath.
Sentence:    helical images in 1.25mm collimation were obtained from 
the lung bases to the apices after the uneventful administration of 
150cc of intravenous optiray-320.
Sentence:    there are no filling defects to suggest pulmonary 
embolism.
Sentence:  there are dependent bibasilar opacities compatible with 
atelectasis.
Sentence:  there is some scarring at the right apex.
Sentence:  the lungs 
are otherwise clear.
Sentence:   
1.
Sentence:  no filling defects to suggest pulmonary embolism.
Sentence:  2.
Sentence:  dependent consolidation compatible with atelectasis.
Sentence:  o26 
 







ct thorax  with contrast   05/10/08     0940 hours

   
chest pain.
Sentence:     
4/9/07.
Sentence:     
cta of the chest with nonionic intravenous contrast as per the pe
protocol.
Sentence:     
there is no evidence of a pulmonary embolism.
Sentence:  the lungs demonstrate no evidence of focal consolidation.
Sentence:  there is
a small ground-glass subpleural nodule, image 110 in the right
upper lobe.this measures 6mm.
Sentence:  central airway is patent.
Sentence:  there are small mediastinal nodes which are unchanged.
Sentence:  there is no
hilar adenopathy.
Sentence:  there are no pleural effusions.
Sentence:  there is no
pericardial effusion.
Sentence:  exam of the upper abdomen demonstrates no definite abnormality.
Sentence:  there is atelectasis at the left base and lingula.
Sentence:  visualized bones demonstrate no focal lesions.
Sentence:     
1.
Sentence:  no evidence of a pulmonary embolism.
Sentence:  2.
Sentence:  ground-glass subcentimeter nodule in the right upper lobe.
Sentence:  this
is nonspecific and 3 month interval follow-up is suggested.
Sentence:  







ct angiography chest with contrast   12/03/07     1614 hours

   
unresponsive.
Sentence:     
ct of the chest with nonionic intravenous contrast as per the pe
protocol.
Sentence:     
there is no evidence of a pulmonary embolism.
Sentence:  the lungs demonstrate small bilateral pleural effusions with
atelectasis.
Sentence:  there is a focal irregular nodule in the right upper
lobe anteriorly, image 153.
Sentence:  this measures 7x6mm.
Sentence:  there is no
definite evidence of calcification.
Sentence:  there is also a subpleural
nodular density image 100 in the left upper lobe, however, there
is streak artifact from the pacer, which limits evaluation.
Sentence:  central airway is patent.
Sentence:  there is no evidence of thoracic adenopathy.
Sentence:  there is no
pericardial effusion.
Sentence:  there is cardiomegaly.
Sentence:  there is coronary
artery calcification.
Sentence:  exam of the upper abdomen demonstrates slight hypertrophy of the
lateral segment which is partially imaged.
Sentence:  there is slight
infiltration of the visualized mesentery.
Sentence:  the visualized bones demonstrate no focal lesions.
Sentence:  there are
degenerative changes of the spine.
Sentence:  main pulmonary artery is slightly dilated.
Sentence:     
1.
Sentence:  no evidence of a pulmonary embolism.
Sentence:  2.
Sentence:  cardiomegaly with small bilateral pleural effusions and slight
septal thickening which may indicate pulmonary edema.
Sentence:  3.
Sentence:  focal nodule in the right upper lobe and subpleural nodule in
the left upper lobe which are nonspecific and amenable to interval
follow up.
Sentence:  4.
Sentence:  slight enlargement of the main pulmonary artery which may
indicate pulmonary arterial hypertension.
Sentence:  5.
Sentence:  slight hypertrophy of the lateral segment of the liver, which
may indicate underlying cirrhosis.
Sentence:  dedicated liver imaging could
be obtained after the patient resolves this acute process.
Sentence:  






ct chest pe protocol:  12/27/06  1:23 am
   dyspnea, pe protocol.
Sentence:    no prior comparison.
Sentence:    post iv contrast 0.6mm helical images were obtained from 
lung base through thoracic inlet with coronal reformation.
Sentence:    
opacification of the pulmonary arteries reveal no pulmonary embolism.
Sentence:  cardiomegaly with minimal bilateral pleural effusions, right greater 
than left are present.
Sentence:  the anatomy of the chest is distorted by the patient's marked 
thoracic kyphosis.
Sentence:  a compression fracture is observed at the t3 level with additional 
wedge compression fractures present throughout the remainder of the 
thoracolumbar spine as seen on the lateral chest radiograph from 
yesterday.
Sentence:  additional sagittal reconstructed images of the thoracic 
spine  performed better demonstrate these compression deformities.
Sentence:  the patient's severe thoracic kyphosis likely contributes to the 
patient's dyspnea.
Sentence:  t
here is associated aortic ectasia although no significant aneurysm or 
dissection is observed.
Sentence:  associated dependent subsegmental atelectasis is present.
Sentence:  no 
endotracheal lesions.
Sentence:  
1.
Sentence:  no pulmonary embolism.
Sentence:  2.
Sentence:  marked thoracic kyphosis with multiple anterior wedge compression 
deformities, age indeterminate with associated cardiomegaly, aortic 
ectasia and dependent subsegmental atelectasis.
Sentence:  d27






In [87]:
n = 4

# this is a generator object. Once iterate through it, it is empty
grams = ngrams(new_ts.split(), n)

fdist = FreqDist(grams)
#for ng, count in fdist.items():
#    print(ng, count)
    
for ng,count in fdist.most_common(50):
    print('N-Gram:', ng, 'Count: ', count)


---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-87-813ee8f2070e> in <module>()
      2 
      3 # this is a generator object. Once iterate through it, it is empty
----> 4 grams = ngrams(new_ts.split(), n)
      5 
      6 fdist = FreqDist(grams)

AttributeError: 'list' object has no attribute 'split'

In [ ]: