In [1]:
    
# Importing libraries
import PySide
%matplotlib inline
import pandas as pd 
import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm
import numpy as np
from sklearn import preprocessing
#from sklearn import linear_model
#from sklearn.ensemble import RandomForestClassifier
#from sklearn.metrics import mean_squared_error
#from skll import kappa
from time import time
import warnings
warnings.filterwarnings("ignore")
import os
s = ["Product_Info_1, Product_Info_2, Product_Info_3, Product_Info_5, Product_Info_6, Product_Info_7, Employment_Info_2, Employment_Info_3, Employment_Info_5, InsuredInfo_1, InsuredInfo_2, InsuredInfo_3, InsuredInfo_4, InsuredInfo_5, InsuredInfo_6, InsuredInfo_7, Insurance_History_1, Insurance_History_2, Insurance_History_3, Insurance_History_4, Insurance_History_7, Insurance_History_8, Insurance_History_9, Family_Hist_1, Medical_History_2, Medical_History_3, Medical_History_4, Medical_History_5, Medical_History_6, Medical_History_7, Medical_History_8, Medical_History_9, Medical_History_11, Medical_History_12, Medical_History_13, Medical_History_14, Medical_History_16, Medical_History_17, Medical_History_18, Medical_History_19, Medical_History_20, Medical_History_21, Medical_History_22, Medical_History_23, Medical_History_25, Medical_History_26, Medical_History_27, Medical_History_28, Medical_History_29, Medical_History_30, Medical_History_31, Medical_History_33, Medical_History_34, Medical_History_35, Medical_History_36, Medical_History_37, Medical_History_38, Medical_History_39, Medical_History_40, Medical_History_41",
    "Product_Info_4, Ins_Age, Ht, Wt, BMI, Employment_Info_1, Employment_Info_4, Employment_Info_6, Insurance_History_5, Family_Hist_2, Family_Hist_3, Family_Hist_4, Family_Hist_5",
     "Medical_History_1, Medical_History_10, Medical_History_15, Medical_History_24, Medical_History_32"]
 
varTypes = dict()
varTypes['categorical'] = s[0].split(', ')
varTypes['continuous'] = s[1].split(', ')
varTypes['discrete'] = s[2].split(', ')
varTypes['dummy'] = ["Medical_Keyword_"+str(i) for i in range(1,49)]
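# Optional sanity check (a quick sketch, not part of the original run): how many
# columns fall into each variable group defined above.
print {k: len(v) for k, v in varTypes.items()}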
#Import training data 
df = pd.read_csv('prud_files/train.csv')
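# Optional check (sketch): dimensions of the raw training data; assumes the
# Prudential train.csv is available at the path above.
print df.shape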
'''Disabled: replacing NaNs with -1 and summing the Medical_Keyword columns
# Get all the columns that have NaNs
a = pd.isnull(df).sum()
nullColumns = a[a>0].index.values
# Determine the min and max values for the NaN columns
a = pd.DataFrame(df, columns=nullColumns).describe()
# Convert all NaNs to -1 and sum up all medical keywords across columns
df = df.fillna(-1)
b = pd.DataFrame(df[varTypes["dummy"]].sum(axis=1), columns=["Medical_Keyword_Sum"])
df = pd.concat([df,b], axis=1, join='outer')
'''
# Sum Medical_Keyword_1-48 into a single Medical_Keyword_Sum feature
b = pd.DataFrame(df[varTypes["dummy"]].sum(axis=1), columns=["Medical_Keyword_Sum"])
df= pd.concat([df,b], axis=1, join='outer')
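# Optional check (sketch): summary of the new keyword-count feature, which
# should range from 0 up to at most 48.
print df['Medical_Keyword_Sum'].describe()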
## Extract key columns for normalization
df_n = df.copy()
# Retrieve all dummy vars from Product_Info_2 
a = pd.get_dummies(df["Product_Info_2"]).columns.tolist()
# Create an enumerated dictionary of Product_Info_2 categories (category -> integer code)
norm_PI2_dict = {c: i for i, c in enumerate(a, start=1)}
df_n = df_n.replace(to_replace={'Product_Info_2':norm_PI2_dict})
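# Optional check (sketch): a few of the assigned category codes and the encoded
# values now present in df_n.
print sorted(norm_PI2_dict.items())[:5]
print sorted(df_n['Product_Info_2'].unique())[:5]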
# Normalize a single dataframe column to [0, 1] and return it as a flat array
def normalize_df(d):
    min_max_scaler = preprocessing.MinMaxScaler()
    # MinMaxScaler expects a 2-D array, so reshape the column and flatten the result
    x = d.values.astype(float).reshape(-1, 1)
    return min_max_scaler.fit_transform(x).ravel()
df_n.Response = normalize_df(df_n.Response)
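# Optional check (sketch): the normalized Response should now span [0, 1].
print df_n.Response.min(), df_n.Response.max()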
#Normalize relevant columns
'''
df_train_n = df_train_n[["Response"]+varTypes["categorical"]+varTypes["discrete"]]
df_test_n = df_test_n[["Response"]+varTypes["categorical"]+varTypes["discrete"]]
for col in df_train_n:
    df_train_n[col] = normalize_df(df_train_n[col])
for col in df_test_n:
    df_test_n[col] = normalize_df(df_test_n[col])
    
#Combine cells together
'''
#df_n is normalized, df is categorical
df_n = pd.concat(
        [pd.DataFrame(df.Id),
        pd.DataFrame(df['Response']),
        pd.DataFrame(df_n['Product_Info_2']),
        pd.DataFrame(df['Medical_Keyword_Sum']),
        pd.DataFrame(df[varTypes['continuous']])], 
        axis=1, 
        join='outer')
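# Optional check (sketch): shape and first rows of the reduced modelling frame.
print df_n.shape
print df_n.head()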
             
print "Ready for Modelling"
# Columns selected for modelling
print df_n.columns.tolist()
# Group by risk response rating
df_n_gb = df_n.groupby("Response")
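# Optional check (sketch): number of rows in each Response rating group.
print df_n_gb.size()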
'''Disabled: take a partial sample (1% of each Response group) for faster processing
df_n_test = pd.DataFrame()
for name, group in df_n_gb:
    g = group[:len(group)/100]
    df_n_test = pd.concat([df_n_test, g], axis=0, join='outer')
'''
# df_n is the output data frame used for the plots below
    
    
    Out[1]:
In [145]:
    
# Plot a scatter matrix of the risk Response and key features
from pandas.tools import plotting
cat = ['Response', 'Ht', 'BMI', 'Ins_Age', 'Product_Info_2', 'Medical_Keyword_Sum']
plotting.scatter_matrix(df_n[cat], figsize=(20,20))
directory = 'images/scatter_matrix/'
if not os.path.exists(directory):
    os.makedirs(directory)
file_string = directory+'Response_scatter_matrix_2016-03-06.png'
print ("")
plt.savefig(file_string)
plt.show()
    
    
    
    
In [147]:
    
g = df_n.groupby('Response')
# Histograms of Ht, BMI, Ins_Age, Product_Info_2 and Medical_Keyword_Sum per Response rating
'''Disabled: take a partial sample (1% of each Response group) for faster processing
df_n_test = pd.DataFrame()
for name, group in df_n_gb:
    g = group[:len(group)/100]
    df_n_test = pd.concat([df_n_test, g], axis=0, join='outer')
    
g = df_n_test.groupby('Response')
'''
features =  ['Ht','BMI','Ins_Age','Product_Info_2','Medical_Keyword_Sum']
i=0
for cat in features:
    for name, group in g:
        plt.figure(i, figsize=[5,5])
        # Ratings are horizontal
        # Features on Y-axis
        plt.title(str(cat)+" Histogram for Risk Response Rating "+str(name))
        plt.hist(group[cat])
        plt.ylabel('Count')
        plt.xlabel(str(cat))
        directory = 'images/'+str(cat)+'/'
        if not os.path.exists(directory):
            os.makedirs(directory)
            
            
        file_string = directory+str(name)+'_hist_Response_'+str(cat)+'_-2016-03-06.png'
        print ("")
        
        #plt.savefig(file_string)
        plt.show()
        i+=1
    
    
In [ ]:
    
''' Tried playing with hexbin plots... TODO: Plot Response Rating vs different features
g = df_n_test.groupby('Response')
cat = 'Ht'
plt.figure(0, figsize=[5,5])
plt.title(str(cat)+" Hexbin plot for Risk Response Rating")
df_n_test.plot(kind='hexbin', x='Response', y=cat, gridsize=25)
plt.ylabel(cat)
plt.xlabel('Response')
#plt.savefig('images/'+str(name)+'_hist_Response-2016-03-06.png')
    
plt.show()
i+=1    
'''
'''
plt.figure(0)
plt.subplot(121)
plt.title("Categorical - Histogram for Risk Response")
plt.xlabel("Risk Response (1-7)")
plt.ylabel("Frequency")
plt.hist(df.Response)
plt.savefig('images/hist_Response.png')
print df.Response.describe()
print ""
'''