In [1]:
from pandas.io.sql import read_frame
import pandas as pd
import numpy as np
import MySQLdb
import matplotlib.pyplot as plt
In [2]:
# does dataset.pickle exist?
have_pickle = !ls dataset.pickle 2>/dev/null
if len(have_pickle) > 0:
    dataset = pd.read_pickle('dataset.pickle')
else:
    # sql connection parameters come from connect.py
    have_connect = !ls connect.py 2>/dev/null
    if len(have_connect) == 0:
        !mv ../../connect.py .
    from connect import *
    mysql_cn = MySQLdb.connect(host=mysqlserver, port=3306,
                               user=mysqlusername, passwd=mysqlpassword,
                               db=mysqldbname)
    dataset = read_frame("select * from datamining_table", mysql_cn, 'nct_id')
    dataset.to_pickle('dataset.pickle')
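pandas.io.sql.read_frame has since been removed from pandas; the current equivalent is pandas.read_sql. A minimal sketch of the same load, assuming the connect.py variables above and a SQLAlchemy engine (not part of the original run):
from sqlalchemy import create_engine
engine = create_engine('mysql+mysqldb://%s:%s@%s:3306/%s'
                       % (mysqlusername, mysqlpassword, mysqlserver, mysqldbname))
dataset = pd.read_sql("select * from datamining_table", engine, index_col='nct_id')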
Getting summary stats
In [3]:
dataset.describe().transpose()
Out[3]:
[summary statistics table not preserved in this export]
Capping some continuous variables
In [4]:
%matplotlib inline
to_cap = [
'study_length_yrs',
'number_of_arms',
'number_of_groups',
'total_officials',
'total_sponsors',
'total_facilities',
'facilities_countries',
'facilities_states',
'total_conditions',
'total_conditions_main',
'total_conditions_top'
]
for c in to_cap:
    p = plt.figure()
    p.set_figwidth(10)
    dataset[c].hist(bins=100)
    p.suptitle(c)
    p.show()
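The cap values in the next cell were presumably read off these histograms; as a cross-check (a hypothetical helper, not in the original notebook), the upper quantiles give a similar picture:
# 99th percentile of each column to be capped, one reference point for choosing caps
dataset[to_cap].quantile(0.99)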
In [11]:
cap_length = {
'study_length_yrs': 10,
'number_of_arms': 4,
'number_of_groups': 1,
'total_officials': 2,
'total_sponsors': 3,
'total_facilities': 2,
'facilities_countries': 2,
'facilities_states': 2,
'total_conditions': 11,
'total_conditions_main': 5,
'total_conditions_top': 2
}
for c, maxval in cap_length.items():
    dataset[c] = dataset[c].map(lambda x: maxval if x > maxval else x)
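Equivalently, pandas' built-in Series.clip does the same truncation in one call (NaN values pass through unchanged in both versions):
for c, maxval in cap_length.items():
    dataset[c] = dataset[c].clip(upper=maxval)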
Normalizing and replacing missing values with the mean (0)
In [37]:
dataset_norm = dataset.copy()
dataset_norm_withingroup = dataset.copy()
to_normalize = [
'study_length_yrs',
'number_of_arms',
'number_of_groups',
'minimum_age_days',
'maximum_age_days',
'total_officials',
'total_sponsors',
'total_facilities',
'facilities_countries',
'facilities_states',
'total_conditions',
'total_conditions_main',
'total_conditions_top'
]
zscore = lambda x: (x - x.mean()) / x.std()

def normalize(df, colname, grouped=False):
    if grouped:
        # z-score within each target class
        df[colname] = df[colname].groupby(df['target']).transform(zscore)
    else:
        # z-score within the labeled and unlabeled groups separately
        df[colname] = df[colname].groupby(df['target'].isnull()).transform(zscore)
    # after z-scoring the mean is 0, so filling NaN with 0 imputes the mean
    df[colname] = df[colname].map(lambda x: 0 if np.isnan(x) else x)

for c in to_normalize:
    normalize(dataset_norm, c, False)
    normalize(dataset_norm_withingroup, c, True)
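To make the "missing becomes the mean" point concrete, a toy sketch: a z-scored column has mean 0, so filling NaN with 0 is the same as imputing the group mean.
s = pd.Series([1.0, 2.0, 3.0, np.nan])
z = (s - s.mean()) / s.std()              # -> [-1.0, 0.0, 1.0, NaN]; mean/std ignore the NaN
z.map(lambda x: 0 if np.isnan(x) else x)  # NaN -> 0, i.e. the post-normalization mean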
In [40]:
# initialize training/sample flag columns
def init_vars(df):
    df['training'] = 1.0
    df['sample'] = 1.0
init_vars(dataset_norm)
init_vars(dataset_norm_withingroup)
In [48]:
# procedures to create the test group and a sample of the target
# for a balanced training set
def update_training(df):
    # hold out ~20% of rows as a test set (previous flag value is ignored)
    df['training'] = df['training'].map(lambda x: 0.0 if np.random.rand() > 0.8 else 1.0)

def update_sample(df):
    def transformvar(row):
        # keep only ~15.3% of target == 1 rows, to balance the classes
        if row['target'] == 1 and np.random.rand() > 0.15293:
            return 0.0
        else:
            return 1.0
    df['sample'] = df.apply(transformvar, axis=1)
In [58]:
update_training(dataset_norm)
update_sample(dataset_norm)
update_training(dataset_norm_withingroup)
update_sample(dataset_norm_withingroup)
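A quick sanity check (not in the original run) that the flags yield a roughly balanced training set, assuming target is coded 0/1:
train = dataset_norm[(dataset_norm['training'] == 1.0) & (dataset_norm['sample'] == 1.0)]
train['target'].value_counts()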
In [61]:
dataset_norm.to_pickle('dataset_norm.pickle')
dataset_norm_withingroup.to_pickle('dataset_norm_withingroup.pickle')