In [1]:
    
from pandas.io.sql import read_frame
import pandas as pd
import numpy as np
import MySQLdb
import matplotlib.pyplot as plt
    
In [2]:
    
# Load the dataset, preferring a local pickle cache over a fresh SQL pull.
import os.path
import shutil

if os.path.exists('dataset.pickle'):
    dataset = pd.read_pickle('dataset.pickle')
else:
    # SQL connection parameters come from connect.py (kept out of the notebook
    # so credentials are not committed with the analysis).
    if not os.path.exists('connect.py'):
        # copy (not move) so the shared original stays in place
        shutil.copy('../../connect.py', 'connect.py')
    # explicit names instead of `from connect import *` so readers can trace them
    from connect import mysqlserver, mysqlusername, mysqlpassword, mysqldbname
    mysql_cn = MySQLdb.connect(host=mysqlserver,
                               port=3306, user=mysqlusername, passwd=mysqlpassword,
                               db=mysqldbname)
    try:
        # pandas.io.sql.read_frame was removed in pandas 0.14; pd.read_sql is the
        # supported replacement (same query/connection/index-column semantics).
        dataset = pd.read_sql("select * from datamining_table", mysql_cn,
                              index_col='nct_id')
        dataset.to_pickle('dataset.pickle')
    finally:
        mysql_cn.close()  # always release the DB connection
    
Getting summary statistics for all numeric variables
In [3]:
    
dataset.describe().transpose()
    
    Out[3]:
Capping some continuous variables
In [4]:
    
%matplotlib inline
to_cap = [
'study_length_yrs',
'number_of_arms',
'number_of_groups',
'total_officials',
'total_sponsors',
'total_facilities',
'facilities_countries',
'facilities_states',
'total_conditions',
'total_conditions_main',
'total_conditions_top'
]
for c in to_cap:
    p = plt.figure()
    p.set_figwidth(10)
    dataset[c].hist(bins=100)
    p.suptitle(c)
    p.show()
    
    
    
    
    
    
    
    
    
    
    
    
    
In [11]:
    
# Cap (winsorize) each heavy-tailed variable at a hand-picked maximum,
# chosen from the histograms above.
cap_length = {
    'study_length_yrs': 10,
    'number_of_arms': 4,
    'number_of_groups': 1,
    'total_officials': 2,
    'total_sponsors': 3,
    'total_facilities': 2,
    'facilities_countries': 2,
    'facilities_states': 2,
    'total_conditions': 11,
    'total_conditions_main': 5,
    'total_conditions_top': 2,
}
for col, maxval in cap_length.items():
    # Fix: the original read from an undefined `d2`; the capped frame is `dataset`.
    # clip(upper=maxval) == "maxval if x > maxval else x"; NaNs pass through unchanged.
    dataset[col] = dataset[col].clip(upper=maxval)
    
Normalizing each variable to z-scores and replacing missing values with the mean (which is 0 after normalization)
In [37]:
    
# Work on copies so the capped `dataset` itself stays untouched.
dataset_norm = dataset.copy()
dataset_norm_withingroup = dataset.copy()

# Continuous variables to standardize.
to_normalize = [
    'study_length_yrs',
    'number_of_arms',
    'number_of_groups',
    'minimum_age_days',
    'maximum_age_days',
    'total_officials',
    'total_sponsors',
    'total_facilities',
    'facilities_countries',
    'facilities_states',
    'total_conditions',
    'total_conditions_main',
    'total_conditions_top',
]


def zscore(x):
    """Standardize a series: subtract the mean, divide by the (sample) std."""
    return (x - x.mean()) / x.std()
def normalize(df, colname, grouped=False):
    """Z-score-normalize df[colname] in place, then replace NaNs with 0.

    grouped=True  -> standardize within each value of df['target'];
    grouped=False -> standardize within the target-present / target-missing split.

    After normalization the mean is 0, so filling NaNs with 0 imputes the mean.
    (Fixes the original, which read the global loop variable `c` instead of the
    `colname` parameter and only worked by closure accident.)
    """
    keys = df['target'] if grouped else df['target'].isnull()
    standardized = df[colname].groupby(keys).transform(
        lambda x: (x - x.mean()) / x.std())  # sample std (ddof=1), as before
    # NaN -> 0 covers both missing inputs and zero-variance groups
    df[colname] = standardized.fillna(0.0)
# Apply both normalization variants column by column.
# NOTE(review): as written, normalize()'s body references the loop variable `c`
# (a global) rather than its own `colname` parameter, so this loop variable must
# stay named `c` for the calls to work — confirm/fix in normalize() itself.
for c in to_normalize:
    normalize(dataset_norm, c, False)
    normalize(dataset_norm_withingroup, c, True)
    
In [40]:
    
# initialize bookkeeping flags
def init_vars(df):
    """Mark every row as in the training set and in the sample (flags = 1.0).

    Later cells randomly zero these flags out to build the test split and the
    balanced sample. Mutates df in place.
    """
    for flag in ('training', 'sample'):
        df[flag] = 1.0
# Flag every row as in-training and in-sample; later cells randomly zero these out.
init_vars(dataset_norm)
init_vars(dataset_norm_withingroup)
    
In [48]:
    
# procedures to create test group and sample of target for balanced training set
def update_training(df):
    """Randomly hold out ~20% of rows as the test set (training flag -> 0.0).

    Draws one uniform random number per row; rows with a draw <= 0.8 keep
    training == 1.0. Mutates df in place.
    """
    df['training'] = [1.0 if np.random.rand() <= 0.8 else 0.0
                      for _ in df['training']]
def update_sample(df):
    """Down-sample the positive class for a balanced training set.

    Rows with target != 1 always keep sample == 1.0; rows with target == 1
    are kept with probability ~0.15293, otherwise sample -> 0.0.
    Mutates df in place.
    """
    def keep_row(row):
        # Non-positives are always kept; no random draw happens for them,
        # matching the original short-circuit `and`.
        if row['target'] != 1:
            return 1.0
        return 1.0 if np.random.rand() <= 0.15293 else 0.0
    df['sample'] = df.apply(keep_row, axis=1)
    
In [58]:
    
# Assign the random train/test split and the balanced positive-class sample
# to both normalized datasets. NOTE(review): no RNG seed is set anywhere above,
# so the split differs on every run — seed np.random for reproducibility.
update_training(dataset_norm)
update_sample(dataset_norm)
update_training(dataset_norm_withingroup)
update_sample(dataset_norm_withingroup)
    
In [61]:
    
# Persist both prepared datasets for downstream modeling notebooks.
dataset_norm.to_pickle('dataset_norm.pickle')
dataset_norm_withingroup.to_pickle('dataset_norm_withingroup.pickle')
    
In [ ]: