In [9]:
import re
import time
import os
import sqlite3
import pandas as pd
import numpy as np
In [10]:
class MasterSeer(object):
# database file name on disk
DB_NAME = 'seer.db'
def __init__(self, path = r'./data/', reload = True, verbose = True):
if not isinstance(path, str):
raise TypeError('path must be a string')
if path[-1] != '/':
path += '/' # if path does not end with a slash, add one
self.path = path
self.reload = reload
self.verbose = verbose # keep the flags; init_database and load_data_dictionary test self.verbose
# List to hold lists of [Column Offset, Column Name, Column Length]
self.dataDictInfo = []
self.db_conn = None
self.db_cur = None
def __del__(self):
    # only close the connection if one was actually opened
    if self.db_conn is not None:
        self.db_conn.close()
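As a rough usage sketch (the instance name is illustrative, not from the notebook): constructing the helper stores the data directory and appends the trailing slash when it is missing.

seer = MasterSeer(path='./data', reload=False, verbose=True)
print(seer.path) # './data/' - the slash is added automatically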
In [11]:
def init_database(self, reload):
try:
if reload:
os.remove(self.path + self.DB_NAME)
except OSError:
pass # nothing to remove; the database file did not exist
try:
#initialize database
self.db_conn = sqlite3.connect(self.path + self.DB_NAME)
self.db_cur = self.db_conn.cursor()
if self.verbose:
print('Database initialized')
return self.db_conn, self.db_cur
except Exception as e:
print('ERROR connecting to the database:', e)
return None, None
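Assuming init_database is bound as a method of the MasterSeer instance above, opening (or creating) the SQLite file looks like this; passing reload=True deletes any existing seer.db first.

conn, cur = seer.init_database(reload=False) # reuse ./data/seer.db if it already exists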
In [12]:
def load_data_dictionary(self, fname = r'SeerDataDict.txt'):
REGEX_DD = r'\$char([0-9]+).' # raw string so the \$ escape reaches re intact
t0 = time.perf_counter()
if self.verbose:
print('\nStart Load of Data Dictionary')
# read our custom tab delimited data dictionary
df = pd.read_csv(self.path + fname, delimiter='\t')
# drop all rows where IMPORT_0_1 is a zero.
df = df[df.IMPORT_0_1 > 0]
# pre-compile regex to improve performance in loop
reCompiled = re.compile(REGEX_DD)
flen = [] # list to hold parsed field lengths
# add length column
for row in df.TYPE:
fields = reCompiled.match(row)
if fields:
x = int(fields.groups()[0])
flen.append(x)
# check to make sure we read the correct amount of field lengths
if len(flen) != len(df):
print('ERROR reading field lengths')
return None
# add length column to dataframe
df['LENGTH'] = flen
if self.verbose:
print('Data Dictionary loaded in {0:5.4f} sec.'.format(time.perf_counter() - t0),
flush=True)
return df
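A short sketch of loading the custom data dictionary, assuming SeerDataDict.txt sits in the data directory; the LENGTH column holds the field width parsed out of the $charNN strings in TYPE.

dd = seer.load_data_dictionary()
print(dd[['TYPE', 'LENGTH']].head())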
In [13]:
def load_data(self, source='breast', col=[], cond="YR_BRTH > 0", sample_size=5000,
all=False):
if col:
col = ','.join(map(str, col))
else:
col = "*"
if all:
limit = ""
randomize = ""
else:
limit = "LIMIT " + str(sample_size)
randomize = "ORDER BY RANDOM()"
df = pd.read_sql_query("SELECT {0} FROM {1} WHERE {2} {3} {4}".format(
    col, source, cond, randomize, limit), self.db_conn)
return df
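For illustration, a call like the one below (column names borrowed from clean_recode_data) builds the SQL shown in the comment and pulls a random sample from the breast table.

df = seer.load_data(source='breast',
                    col=['YR_BRTH', 'LATERAL', 'SRV_TIME_MON', 'STAT_REC'],
                    cond='YR_BRTH > 0', sample_size=1000)
# executes: SELECT YR_BRTH,LATERAL,SRV_TIME_MON,STAT_REC FROM breast
#           WHERE YR_BRTH > 0 ORDER BY RANDOM() LIMIT 1000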
In [14]:
def clean_recode_data(self, df, dependent_cutoffs):
# drop all rows that have invalid or missing data
try:
df = df.dropna(subset = ['YR_BRTH']) # add column names here as needed
except Exception as err:
pass
try:
df.LATERAL = df.LATERAL.replace([0, 1,2,3], 1) # one site = 1
df.LATERAL = df.LATERAL.replace([4,5,9], 2) # paired = 2
except:
pass
try:
df = df[df.O_DTH_CLASS == 0]
except:
pass
try:
# 0-benign, 1-borderline, 2-in situ, 3-malignant
df = df[df.BEHANAL != 5]
df.BEHANAL = df.BEHANAL.replace([3,4,6], 3)
except:
pass
try:
df = df[df.HST_STGA != 8]
df = df[df.HST_STGA != 9]
except:
pass
try:
# 0-negative, 1-borderline, 2-positive
df = df[df.ERSTATUS != 4]
df = df[df.ERSTATUS != 9]
df.ERSTATUS = df.ERSTATUS.replace(2, 0)
df.ERSTATUS = df.ERSTATUS.replace(1, 2)
df.ERSTATUS = df.ERSTATUS.replace(3, 1)
except:
pass
try:
# 0-negative, 1-borderline, 2-positive
df = df[df.PRSTATUS != 4]
df = df[df.PRSTATUS != 9]
df.PRSTATUS = df.PRSTATUS.replace(2, 0)
df.PRSTATUS = df.PRSTATUS.replace(1, 2)
df.PRSTATUS = df.PRSTATUS.replace(3, 1)
except:
pass
try:
df.RADIATN = df.RADIATN.replace(7, 0)
df.RADIATN = df.RADIATN.replace([2,3,4,5], 1)
df = df[df.RADIATN < 7]
except Exception as err:
pass
try:
# code as 1 or 2-more than one
df.NUMPRIMS = df.NUMPRIMS.replace(list(range(2, 37)), 2)
except Exception as err:
pass
# Example dependent_cutoffs=[60,120,500]
# if survival is less than 60, SRV_BUCKET is set to 0
# if survival is >= 60 and < 120, SRV_BUCKET is set to 1; >= 120 and < 500 gives 2
# anything at or above the last cutoff falls into the final bucket (3 in this example)
if len(dependent_cutoffs) > 0:
# create new column of all NaN
df['SRV_BUCKET'] = np.nan
# fill buckets
last_cut = 0
for x, cut in enumerate(dependent_cutoffs):
df.loc[(df.SRV_TIME_MON >= last_cut) &
(df.SRV_TIME_MON < cut), 'SRV_BUCKET'] = x
last_cut = cut
# assign all values larger than last cutoff to next bucket number
df['SRV_BUCKET'] = df['SRV_BUCKET'].fillna(len(dependent_cutoffs))
dep_col = 'SRV_BUCKET'
df = df.drop(columns='SRV_TIME_MON')
else:
dep_col = 'SRV_TIME_MON'
# categorical columns to one hot encode, check to make sure they are in df
#cat_cols_to_encode = list(set(['RACE', 'ORIGIN', 'SEX', 'TUMOR_2V', 'HISTREC'])
# & set(df.columns))
#df = self.one_hot_data(df, cat_cols_to_encode)
df['CENSORED'] = df.STAT_REC == 4
df = df.drop(columns='STAT_REC')
df = df.replace([np.inf, -np.inf], np.nan) # assign the result; replace does not modify in place
df = df.fillna(0)
# write a copy of the cleaned data to disk for review
with pd.ExcelWriter('clean.xlsx') as exc:
    df.to_excel(exc)
return df, dep_col
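To make the bucketing rule concrete: with dependent_cutoffs=[60, 120, 500], a survival of 45 months is coded 0, 90 months is 1, 200 months is 2, and anything at or above 500 becomes 3; with an empty cutoff list the raw SRV_TIME_MON column is kept as the dependent variable. A hedged usage sketch:

clean_df, dep_col = seer.clean_recode_data(df, dependent_cutoffs=[60, 120, 500])
print(dep_col) # 'SRV_BUCKET'
print(clean_df.SRV_BUCKET.value_counts())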
In [15]:
def one_hot_data(self, data, cols):
# check to only encode columns that are in the data
col_to_process = [c for c in cols if c in data]
return pd.get_dummies(data, columns = col_to_process, prefix = col_to_process)
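A small illustration of one_hot_data on a toy frame (RACE is one of the categorical columns suggested in the commented-out list above); columns not present in the data are simply skipped.

toy = pd.DataFrame({'RACE': [1, 2, 1], 'YR_BRTH': [1950, 1960, 1945]})
encoded = seer.one_hot_data(toy, ['RACE', 'SEX']) # SEX is not in toy, so only RACE is encoded
print(list(encoded.columns)) # ['YR_BRTH', 'RACE_1', 'RACE_2']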