In [7]:
#ETL of data frame, separation of training and testing data

# Importing libraries

import PySide

%pylab inline
%matplotlib inline

import pandas as pd 

import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm

import numpy as np

from sklearn import preprocessing
from sklearn import linear_model
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import mean_squared_error

from sklearn import cross_validation


#from skll import kappa

from time import time


#Filter out warnings - comment out if debugging code
import warnings
warnings.filterwarnings("ignore")

# Percentage of data for test set
test_set_size = 0.4 

# List of categorical, continuous and discrete
s = ["Product_Info_1, Product_Info_2, Product_Info_3, Product_Info_5, Product_Info_6, Product_Info_7, Employment_Info_2, Employment_Info_3, Employment_Info_5, InsuredInfo_1, InsuredInfo_2, InsuredInfo_3, InsuredInfo_4, InsuredInfo_5, InsuredInfo_6, InsuredInfo_7, Insurance_History_1, Insurance_History_2, Insurance_History_3, Insurance_History_4, Insurance_History_7, Insurance_History_8, Insurance_History_9, Family_Hist_1, Medical_History_2, Medical_History_3, Medical_History_4, Medical_History_5, Medical_History_6, Medical_History_7, Medical_History_8, Medical_History_9, Medical_History_11, Medical_History_12, Medical_History_13, Medical_History_14, Medical_History_16, Medical_History_17, Medical_History_18, Medical_History_19, Medical_History_20, Medical_History_21, Medical_History_22, Medical_History_23, Medical_History_25, Medical_History_26, Medical_History_27, Medical_History_28, Medical_History_29, Medical_History_30, Medical_History_31, Medical_History_33, Medical_History_34, Medical_History_35, Medical_History_36, Medical_History_37, Medical_History_38, Medical_History_39, Medical_History_40, Medical_History_41",
    "Product_Info_4, Ins_Age, Ht, Wt, BMI, Employment_Info_1, Employment_Info_4, Employment_Info_6, Insurance_History_5, Family_Hist_2, Family_Hist_3, Family_Hist_4, Family_Hist_5",
     "Medical_History_1, Medical_History_10, Medical_History_15, Medical_History_24, Medical_History_32"]
 

varTypes = dict()


varTypes['categorical'] = s[0].split(', ')
varTypes['continuous'] = s[1].split(', ')
varTypes['discrete'] = s[2].split(', ')
varTypes['dummy'] = ["Medical_Keyword_"+str(i) for i in range(1,49)]


#Import training data 
d_raw = pd.read_csv('prud_files/train.csv')
d = d_raw.copy()


# Get all the columns that have NaNs
a = pd.isnull(d).sum()
nullColumns = a[a>0].index.values

#Determine the min and max values for the NaN columns
a = pd.DataFrame(d, columns=nullColumns).describe()

# Convert all NaNs to -1 and sum up all medical keywords across columns
df = d.fillna(-1)
b = pd.DataFrame(df[varTypes["dummy"]].sum(axis=1), columns=["Medical_Keyword_Sum"])
df= pd.concat([df,b], axis=1, join='outer')


#Turn split train to test on or off.  
#If on, 10% of the dataset is used for feature training
#If off, training set is loaded from file

splitTrainToTest = 1

if(splitTrainToTest):
    
    d_gb = df.groupby("Response")
    
    #Partial data set to train
    df_train = pd.DataFrame()
    
    #Partial data set to test
    df_test = pd.DataFrame()
    
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(iris.data, iris.target, test_size=0.4, random_state=0)
    
    for name, group in d_gb:
        
        test_g = group[:len(group)*test_set_size]
        train_g = group[len(group)*test_set_size:]
        df_test = pd.concat([df_test, test_g], axis=0, join='outer')
        df_train = pd.concat([df_train, train_g], axis=0, join='outer')
        
    print "test data is 10% training data"
    
else:
    d_test = pd.read_csv('prud_files/test.csv')
    df_test = d_test.fillna(-1)
    b = pd.DataFrame(df[varTypes["dummy"]].sum(axis=1), columns=["Medical_Keyword_Sum"])
    df_test= pd.concat([df_test,b], axis=1, join='outer')
    print "test data is prud_files/test.csv"
    

## Extract key columns for normalization

df_train_n = df_train.copy()
df_test_n = df_test.copy()

#Get all the Product Info 2 categories

a = pd.get_dummies(df["Product_Info_2"]).columns.tolist()
norm_PI2_dict = dict()

#Create an enumerated dictionary of Product Info 2 categories

i=1
for c in a:
    norm_PI2_dict.update({c:i})
    i+=1 

df_train_n = df_train_n.replace(to_replace={'Product_Info_2':norm_PI2_dict})
df_test_n = df_test_n.replace(to_replace={'Product_Info_2':norm_PI2_dict})

# normalizes a single dataframe column and returns the result

def normalize_df(d):
    min_max_scaler = preprocessing.MinMaxScaler()
    x = d.values.astype(np.float)
    return min_max_scaler.fit_transform(x)


#Normalize relevant columns

df_train_n = df_train_n[["Response"]+varTypes["categorical"]+varTypes["discrete"]]
df_test_n = df_test_n[["Response"]+varTypes["categorical"]+varTypes["discrete"]]

for col in df_train_n:
    df_train_n[col] = normalize_df(df_train_n[col])
for col in df_test_n:
    df_test_n[col] = normalize_df(df_test_n[col])
#Combine cells together


df_train_n = pd.concat([pd.DataFrame(df_train.Id),df_train_n,df_train[varTypes['continuous']],pd.DataFrame(df_train.Medical_Keyword_Sum)], axis=1, join='outer')

df_test_n = pd.concat([pd.DataFrame(df_test.Id),df_test_n,df_test[varTypes['continuous']],pd.DataFrame(df_test.Medical_Keyword_Sum)], axis=1, join='outer')

print "Ready for ML"


Populating the interactive namespace from numpy and matplotlib
WARNING: pylab import has clobbered these variables: ['f', 'clf']
`%matplotlib` prevents importing * from pylab and numpy
test data is 10% training data
Ready for ML

In [8]:
import logging

from six import string_types
from six.moves import xrange as range

from sklearn.metrics import confusion_matrix, f1_score, SCORERS


### Imported from skll package.  http://skll.readthedocs.org/en/latest/_modules/skll/metrics.html

def kappa(y_true, y_pred, weights=None, allow_off_by_one=False):
    """
    Calculates the kappa inter-rater agreement between two the gold standard
    and the predicted ratings. Potential values range from -1 (representing
    complete disagreement) to 1 (representing complete agreement).  A kappa
    value of 0 is expected if all agreement is due to chance.

    In the course of calculating kappa, all items in `y_true` and `y_pred` will
    first be converted to floats and then rounded to integers.

    It is assumed that y_true and y_pred contain the complete range of possible
    ratings.

    This function contains a combination of code from yorchopolis's kappa-stats
    and Ben Hamner's Metrics projects on Github.

    :param y_true: The true/actual/gold labels for the data.
    :type y_true: array-like of float
    :param y_pred: The predicted/observed labels for the data.
    :type y_pred: array-like of float
    :param weights: Specifies the weight matrix for the calculation.
                    Options are:

                        -  None = unweighted-kappa
                        -  'quadratic' = quadratic-weighted kappa
                        -  'linear' = linear-weighted kappa
                        -  two-dimensional numpy array = a custom matrix of
                           weights. Each weight corresponds to the
                           :math:`w_{ij}` values in the wikipedia description
                           of how to calculate weighted Cohen's kappa.

    :type weights: str or numpy array
    :param allow_off_by_one: If true, ratings that are off by one are counted as
                             equal, and all other differences are reduced by
                             one. For example, 1 and 2 will be considered to be
                             equal, whereas 1 and 3 will have a difference of 1
                             for when building the weights matrix.
    :type allow_off_by_one: bool
    """
    logger = logging.getLogger(__name__)

    # Ensure that the lists are both the same length
    assert(len(y_true) == len(y_pred))

    # This rather crazy looking typecast is intended to work as follows:
    # If an input is an int, the operations will have no effect.
    # If it is a float, it will be rounded and then converted to an int
    # because the ml_metrics package requires ints.
    # If it is a str like "1", then it will be converted to a (rounded) int.
    # If it is a str that can't be typecast, then the user is
    # given a hopefully useful error message.
    # Note: numpy and python 3.3 use bankers' rounding.
    try:
        y_true = [int(np.round(float(y))) for y in y_true]
        y_pred = [int(np.round(float(y))) for y in y_pred]
    except ValueError as e:
        logger.error("For kappa, the labels should be integers or strings "
                     "that can be converted to ints (E.g., '4.0' or '3').")
        raise e

    # Figure out normalized expected values
    min_rating = min(min(y_true), min(y_pred))
    max_rating = max(max(y_true), max(y_pred))

    # shift the values so that the lowest value is 0
    # (to support scales that include negative values)
    y_true = [y - min_rating for y in y_true]
    y_pred = [y - min_rating for y in y_pred]

    # Build the observed/confusion matrix
    num_ratings = max_rating - min_rating + 1
    observed = confusion_matrix(y_true, y_pred,
                                labels=list(range(num_ratings)))
    num_scored_items = float(len(y_true))

    # Build weight array if weren't passed one
    if isinstance(weights, string_types):
        wt_scheme = weights
        weights = None
    else:
        wt_scheme = ''
    if weights is None:
        weights = np.empty((num_ratings, num_ratings))
        for i in range(num_ratings):
            for j in range(num_ratings):
                diff = abs(i - j)
                if allow_off_by_one and diff:
                    diff -= 1
                if wt_scheme == 'linear':
                    weights[i, j] = diff
                elif wt_scheme == 'quadratic':
                    weights[i, j] = diff ** 2
                elif not wt_scheme:  # unweighted
                    weights[i, j] = bool(diff)
                else:
                    raise ValueError('Invalid weight scheme specified for '
                                     'kappa: {}'.format(wt_scheme))

    hist_true = np.bincount(y_true, minlength=num_ratings)
    hist_true = hist_true[: num_ratings] / num_scored_items
    hist_pred = np.bincount(y_pred, minlength=num_ratings)
    hist_pred = hist_pred[: num_ratings] / num_scored_items
    expected = np.outer(hist_true, hist_pred)

    # Normalize observed array
    observed = observed / num_scored_items

    # If all weights are zero, that means no disagreements matter.
    k = 1.0
    if np.count_nonzero(weights):
        k -= (sum(sum(weights * observed)) / sum(sum(weights * expected)))

    return k

In [3]:
# Lasso CV 

train_data = df_train_n.values.copy()
test_data = df_test_n.values.copy()

X_train = train_data[0:,2:]
Y_train = train_data[0:,1]

X_test = test_data[0:,2:]
Y_test = test_data[0:,1]

t0 = time()

clf = linear_model.LassoLarsCV()
clf.fit(X_train, Y_train)
pred = clf.predict(X_test)

mms = preprocessing.MinMaxScaler()
x = df["Response"].values.astype(np.float)
mms.fit_transform(x)

pred_transformed = mms.inverse_transform(pred)
Y_test_transformed = mms.inverse_transform(Y_test)

k = kappa(pred_transformed, Y_test_transformed, weights='quadratic')

In [4]:
params = clf.alpha_

print "The parameters are: ", params
print "Kappa is: ", k


The parameters are:  1.04342989855e-07
Kappa is:  0.358159226563

In [5]:
'''
df_ak = pd.DataFrame(alpha_kappa,columns=["alpha","kappa"])   

plt.figure(1, figsize=[10,10])
plt.subplot(211)
plt.title("alpha vs. kappa: linear lasso - test#1")
plt.xlabel("alpha[0.001,0.1]")
plt.ylabel("kappa")
plt.legend
plt.scatter(x=df_ak.alpha,y=df_ak.kappa)

plt.subplot(212)
plt.title("alpha vs. time: linear lasso - test#1")
plt.xlabel("alpha[0.001,0.1]")
plt.ylabel("time(s)")
plt.legend
plt.scatter(x=df_ak.alpha,y=df_ak.time)

#plt.savefig('images/scatterLassoCV_alpha_kappa_test1.png')
    
df_ak.describe()

'''


Out[5]:
'\ndf_ak = pd.DataFrame(alpha_kappa,columns=["alpha","kappa"])   \n\nplt.figure(1, figsize=[10,10])\nplt.subplot(211)\nplt.title("alpha vs. kappa: linear lasso - test#1")\nplt.xlabel("alpha[0.001,0.1]")\nplt.ylabel("kappa")\nplt.legend\nplt.scatter(x=df_ak.alpha,y=df_ak.kappa)\n\nplt.subplot(212)\nplt.title("alpha vs. time: linear lasso - test#1")\nplt.xlabel("alpha[0.001,0.1]")\nplt.ylabel("time(s)")\nplt.legend\nplt.scatter(x=df_ak.alpha,y=df_ak.time)\n\n#plt.savefig(\'images/scatterLassoCV_alpha_kappa_test1.png\')\n    \ndf_ak.describe()\n\n'

Lasso model - test #1

train_data = df_train_n.values.copy() test_data = df_test_n.values.copy() X_train = train_data[0:,2:] Y_train = train_data[0:,1] X_test = test_data[0:,2:] Y_test = test_data[0:,1] alpha_kappa = list() for i in range(1,100,2): a = float(i)/1000 t0 = time() clf = linear_model.Lasso(alpha=a) clf.fit(X_train, Y_train) pred = clf.predict(X_test) mms = preprocessing.MinMaxScaler() x = df["Response"].values.astype(np.float) mms.fit_transform(x) pred_transformed = mms.inverse_transform(pred) Y_test_transformed = mms.inverse_transform(Y_test) k = kappa(pred_transformed, Y_test_transformed, weights='quadratic') alpha_kappa+=[[a,k,round(time()-t0,3)]] df_ak = pd.DataFrame(alpha_kappa,columns=["alpha","kappa","time"]) plt.figure(1, figsize=[10,10]) plt.subplot(211) plt.title("alpha vs. kappa: linear lasso - test#1") plt.xlabel("alpha[0.001,0.1]") plt.ylabel("kappa") plt.legend plt.scatter(x=df_ak.alpha,y=df_ak.kappa) plt.subplot(212) plt.title("alpha vs. time: linear lasso - test#1") plt.xlabel("alpha[0.001,0.1]") plt.ylabel("time(s)") plt.legend plt.scatter(x=df_ak.alpha,y=df_ak.time) plt.savefig('images/scatterLasso_alpha_kappa_test1.png') df_ak.describe()

Lasso - Test #2

train_data = df_train_n.values.copy() test_data = df_test_n.values.copy() X_train = train_data[0:,2:] Y_train = train_data[0:,1] X_test = test_data[0:,2:] Y_test = test_data[0:,1] alpha_kappa = list() for i in range(1,100,2): a = float(i)/10000 t0 = time() clf = linear_model.Lasso(alpha=a) clf.fit(X_train, Y_train) pred = clf.predict(X_test) mms = preprocessing.MinMaxScaler() x = df["Response"].values.astype(np.float) mms.fit_transform(x) pred_transformed = mms.inverse_transform(pred) Y_test_transformed = mms.inverse_transform(Y_test) k = kappa(pred_transformed, Y_test_transformed, weights='quadratic') alpha_kappa+=[[a,k, round(time()-t0,3)]] df_ak = pd.DataFrame(alpha_kappa,columns=["alpha","kappa", "time"]) plt.figure(2, figsize=[10,10]) plt.subplot(211) plt.title("alpha vs. kappa: linear lasso: test2") plt.xlabel("alpha[0.0001,0.01]") plt.ylabel("kappa") plt.legend plt.scatter(x=df_ak.alpha,y=df_ak.kappa) plt.subplot(212) plt.title("alpha vs. time: linear lasso - test#2") plt.xlabel("alpha[0.0001,0.01]") plt.ylabel("time(s)") plt.legend plt.scatter(x=df_ak.alpha,y=df_ak.time) plt.savefig('images/scatterLasso_alpha_kappa_test2.png') df_ak.describe()

Random forest - test # 1

train_data = df_train_n.values.copy() test_data = df_test_n.values.copy() X_train = train_data[0:,2:] Y_train = train_data[0:,1] X_test = test_data[0:,2:] Y_test = test_data[0:,1] Y_train = np.array(Y_train).astype(int) est_kappa = list() for i in range(1,100,10): t0 = time() clf = RandomForestClassifier(n_estimators = i) clf.fit(X_train, Y_train) pred = clf.predict(X_test) mms = preprocessing.MinMaxScaler() x = df["Response"].values.astype(np.float) mms.fit_transform(x) pred_transformed = mms.inverse_transform(pred) Y_test_transformed = mms.inverse_transform(Y_test) k = kappa(pred_transformed, Y_test_transformed, weights='quadratic') est_kappa+=[[i,k,round(time()-t0,3)]] df_ek = pd.DataFrame(est_kappa,columns=["est","kappa","time"]) plt.figure(3, figsize=[10,10]) plt.subplot(211) plt.title("Estimators vs. kappa: RandomForest: Test1") plt.xlabel("Estimators [1,100,10]") plt.ylabel("kappa") plt.legend plt.scatter(x=df_ek.est,y=df_ek.kappa) plt.subplot(212) plt.title("Time vs. kappa: RandomForest: Test1") plt.xlabel("Estimators [1,100,10]") plt.ylabel("Time(s)") plt.legend plt.scatter(x=df_ek.est,y=df_ek.time) plt.savefig('images/RFC_scatter_alpha_kappa_test1.png') df_ek.describe()

Random forest - test # 2

train_data = df_train_n.values.copy() test_data = df_test_n.values.copy() X_train = train_data[0:,2:] Y_train = train_data[0:,1] X_test = test_data[0:,2:] Y_test = test_data[0:,1] Y_train = np.array(Y_train).astype(int) est_kappa = list() for i in range(100,1000,100): t0 = time() clf = RandomForestClassifier(n_estimators = i) clf.fit(X_train, Y_train) pred = clf.predict(X_test) mms = preprocessing.MinMaxScaler() x = df["Response"].values.astype(np.float) mms.fit_transform(x) pred_transformed = mms.inverse_transform(pred) Y_test_transformed = mms.inverse_transform(Y_test) k = kappa(pred_transformed, Y_test_transformed, weights='quadratic') est_kappa+=[[i,k,round(time()-t0,3)]] df_ek = pd.DataFrame(est_kappa,columns=["est","kappa","time"]) plt.figure(4, figsize=[10,10]) plt.subplot(211) plt.title("Estimators vs. kappa: RandomForest: Test2") plt.xlabel("Estimators [100,1000,100]") plt.ylabel("kappa") plt.legend plt.scatter(x=df_ek.est,y=df_ek.kappa) plt.subplot(212) plt.title("Time vs. kappa: RandomForest: Test1") plt.xlabel("Estimators [100,1000,100]") plt.ylabel("Time(s)") plt.legend plt.scatter(x=df_ek.est,y=df_ek.time) plt.savefig('images/RFC_scatter_alpha_kappa_test2.png') from sklearn import cross_validationdf_ek.describe()

In [ ]:
from sklearn import cross_validation

X_train, X_test, y_train, y_test = cross_validation.train_test_split(iris.data, iris.target, test_size=0.4, random_state=0)


train_data = df_train_n.values.copy()
test_data = df_test_n.values.copy()

X_train = train_data[0:,2:]
Y_train = train_data[0:,1]

X_test = test_data[0:,2:]
Y_test = test_data[0:,1]

Y_train = np.array(Y_train).astype(int)

est_kappa = list()

t0 = time()
    
clf = RandomForestClassifier(n_estimators = 350, )
clf.fit(X_train, Y_train)
pred = clf.predict(X_test)

mms = preprocessing.MinMaxScaler()
x = df["Response"].values.astype(np.float)
mms.fit_transform(x)

pred_transformed = mms.inverse_transform(pred)
Y_test_transformed = mms.inverse_transform(Y_test)

k = kappa(pred_transformed, Y_test_transformed, weights='quadratic')

print "The Kappa for Random Forest Text#3 is ", k

In [6]:
#Outputting file names in a folder

from os import walk

f = []
for (dirpath, dirnames, filenames) in walk(mypath):
    f.extend(filenames)
    break


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-6-17901daae3c7> in <module>()
      4 
      5 f = []
----> 6 for (dirpath, dirnames, filenames) in walk(mypath):
      7     f.extend(filenames)
      8     break

NameError: name 'mypath' is not defined