In [7]:
#ETL of data frame, separation of training and testing data
# Importing libraries
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm
import numpy as np
from sklearn import preprocessing
from sklearn import linear_model
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error
from sklearn import cross_validation
#from skll import kappa
from time import time
#Filter out warnings - comment out if debugging code
import warnings
warnings.filterwarnings("ignore")
# Percentage of data for test set
test_set_size = 0.4
# Lists of categorical, continuous and discrete variable names
s = ["Product_Info_1, Product_Info_2, Product_Info_3, Product_Info_5, Product_Info_6, Product_Info_7, Employment_Info_2, Employment_Info_3, Employment_Info_5, InsuredInfo_1, InsuredInfo_2, InsuredInfo_3, InsuredInfo_4, InsuredInfo_5, InsuredInfo_6, InsuredInfo_7, Insurance_History_1, Insurance_History_2, Insurance_History_3, Insurance_History_4, Insurance_History_7, Insurance_History_8, Insurance_History_9, Family_Hist_1, Medical_History_2, Medical_History_3, Medical_History_4, Medical_History_5, Medical_History_6, Medical_History_7, Medical_History_8, Medical_History_9, Medical_History_11, Medical_History_12, Medical_History_13, Medical_History_14, Medical_History_16, Medical_History_17, Medical_History_18, Medical_History_19, Medical_History_20, Medical_History_21, Medical_History_22, Medical_History_23, Medical_History_25, Medical_History_26, Medical_History_27, Medical_History_28, Medical_History_29, Medical_History_30, Medical_History_31, Medical_History_33, Medical_History_34, Medical_History_35, Medical_History_36, Medical_History_37, Medical_History_38, Medical_History_39, Medical_History_40, Medical_History_41",
"Product_Info_4, Ins_Age, Ht, Wt, BMI, Employment_Info_1, Employment_Info_4, Employment_Info_6, Insurance_History_5, Family_Hist_2, Family_Hist_3, Family_Hist_4, Family_Hist_5",
"Medical_History_1, Medical_History_10, Medical_History_15, Medical_History_24, Medical_History_32"]
varTypes = dict()
varTypes['categorical'] = s[0].split(', ')
varTypes['continuous'] = s[1].split(', ')
varTypes['discrete'] = s[2].split(', ')
varTypes['dummy'] = ["Medical_Keyword_"+str(i) for i in range(1,49)]
#Import training data
d_raw = pd.read_csv('prud_files/train.csv')
d = d_raw.copy()
# Get all the columns that have NaNs
a = pd.isnull(d).sum()
nullColumns = a[a>0].index.values
#Determine the min and max values for the NaN columns
a = pd.DataFrame(d, columns=nullColumns).describe()
# Convert all NaNs to -1 and sum up all medical keywords across columns
df = d.fillna(-1)
b = pd.DataFrame(df[varTypes["dummy"]].sum(axis=1), columns=["Medical_Keyword_Sum"])
df= pd.concat([df,b], axis=1, join='outer')
#Turn split train to test on or off.
#If on, 40% of the training data is held out as the test set (per test_set_size)
#If off, the test set is loaded from file
splitTrainToTest = 1
if(splitTrainToTest):
    d_gb = df.groupby("Response")
    #Partial data set to train
    df_train = pd.DataFrame()
    #Partial data set to test
    df_test = pd.DataFrame()
    #Split each Response class separately so class proportions are preserved
    for name, group in d_gb:
        cut = int(len(group) * test_set_size)
        test_g = group[:cut]
        train_g = group[cut:]
        df_test = pd.concat([df_test, test_g], axis=0, join='outer')
        df_train = pd.concat([df_train, train_g], axis=0, join='outer')
    print "test data is 40% of the training data"
else:
    # note: prud_files/test.csv has no Response column, so the kappa
    # evaluation below requires the split path above
    d_test = pd.read_csv('prud_files/test.csv')
    df_test = d_test.fillna(-1)
    b = pd.DataFrame(df_test[varTypes["dummy"]].sum(axis=1), columns=["Medical_Keyword_Sum"])
    df_test = pd.concat([df_test, b], axis=1, join='outer')
    print "test data is prud_files/test.csv"
## Extract key columns for normalization
df_train_n = df_train.copy()
df_test_n = df_test.copy()
#Get all the Product Info 2 categories
a = pd.get_dummies(df["Product_Info_2"]).columns.tolist()
norm_PI2_dict = dict()
#Create an enumerated dictionary of Product Info 2 categories
for i, c in enumerate(a, start=1):
    norm_PI2_dict[c] = i
df_train_n = df_train_n.replace(to_replace={'Product_Info_2':norm_PI2_dict})
df_test_n = df_test_n.replace(to_replace={'Product_Info_2':norm_PI2_dict})
# normalizes a single dataframe column and returns the result as a 1-D array
def normalize_df(d):
    min_max_scaler = preprocessing.MinMaxScaler()
    # MinMaxScaler expects a 2-D array, so reshape the single column
    x = d.values.astype(np.float).reshape(-1, 1)
    return min_max_scaler.fit_transform(x).ravel()
#Normalize relevant columns
df_train_n = df_train_n[["Response"]+varTypes["categorical"]+varTypes["discrete"]]
df_test_n = df_test_n[["Response"]+varTypes["categorical"]+varTypes["discrete"]]
for col in df_train_n:
    df_train_n[col] = normalize_df(df_train_n[col])
for col in df_test_n:
    df_test_n[col] = normalize_df(df_test_n[col])
#Combine cells together
df_train_n = pd.concat([pd.DataFrame(df_train.Id),df_train_n,df_train[varTypes['continuous']],pd.DataFrame(df_train.Medical_Keyword_Sum)], axis=1, join='outer')
df_test_n = pd.concat([pd.DataFrame(df_test.Id),df_test_n,df_test[varTypes['continuous']],pd.DataFrame(df_test.Medical_Keyword_Sum)], axis=1, join='outer')
print "Ready for ML"
In [8]:
import logging
from six import string_types
from six.moves import xrange as range
from sklearn.metrics import confusion_matrix, f1_score, SCORERS
### Imported from skll package. http://skll.readthedocs.org/en/latest/_modules/skll/metrics.html
def kappa(y_true, y_pred, weights=None, allow_off_by_one=False):
    """
    Calculates the kappa inter-rater agreement between the gold standard
    and the predicted ratings. Potential values range from -1 (representing
    complete disagreement) to 1 (representing complete agreement). A kappa
    value of 0 is expected if all agreement is due to chance.

    In the course of calculating kappa, all items in `y_true` and `y_pred` will
    first be converted to floats and then rounded to integers.

    It is assumed that y_true and y_pred contain the complete range of possible
    ratings.

    This function contains a combination of code from yorchopolis's kappa-stats
    and Ben Hamner's Metrics projects on Github.

    :param y_true: The true/actual/gold labels for the data.
    :type y_true: array-like of float
    :param y_pred: The predicted/observed labels for the data.
    :type y_pred: array-like of float
    :param weights: Specifies the weight matrix for the calculation.
                    Options are:

                        - None = unweighted-kappa
                        - 'quadratic' = quadratic-weighted kappa
                        - 'linear' = linear-weighted kappa
                        - two-dimensional numpy array = a custom matrix of
                          weights. Each weight corresponds to the
                          :math:`w_{ij}` values in the wikipedia description
                          of how to calculate weighted Cohen's kappa.
    :type weights: str or numpy array
    :param allow_off_by_one: If true, ratings that are off by one are counted as
                             equal, and all other differences are reduced by
                             one. For example, 1 and 2 will be considered to be
                             equal, whereas 1 and 3 will have a difference of 1
                             when building the weights matrix.
    :type allow_off_by_one: bool
    """
    logger = logging.getLogger(__name__)

    # Ensure that the lists are both the same length
    assert(len(y_true) == len(y_pred))

    # This rather crazy looking typecast is intended to work as follows:
    # If an input is an int, the operations will have no effect.
    # If it is a float, it will be rounded and then converted to an int
    # because the ml_metrics package requires ints.
    # If it is a str like "1", then it will be converted to a (rounded) int.
    # If it is a str that can't be typecast, then the user is
    # given a hopefully useful error message.
    # Note: numpy and python 3.3 use bankers' rounding.
    try:
        y_true = [int(np.round(float(y))) for y in y_true]
        y_pred = [int(np.round(float(y))) for y in y_pred]
    except ValueError as e:
        logger.error("For kappa, the labels should be integers or strings "
                     "that can be converted to ints (E.g., '4.0' or '3').")
        raise e

    # Figure out normalized expected values
    min_rating = min(min(y_true), min(y_pred))
    max_rating = max(max(y_true), max(y_pred))

    # shift the values so that the lowest value is 0
    # (to support scales that include negative values)
    y_true = [y - min_rating for y in y_true]
    y_pred = [y - min_rating for y in y_pred]

    # Build the observed/confusion matrix
    num_ratings = max_rating - min_rating + 1
    observed = confusion_matrix(y_true, y_pred,
                                labels=list(range(num_ratings)))
    num_scored_items = float(len(y_true))

    # Build weight array if weren't passed one
    if isinstance(weights, string_types):
        wt_scheme = weights
        weights = None
    else:
        wt_scheme = ''
    if weights is None:
        weights = np.empty((num_ratings, num_ratings))
        for i in range(num_ratings):
            for j in range(num_ratings):
                diff = abs(i - j)
                if allow_off_by_one and diff:
                    diff -= 1
                if wt_scheme == 'linear':
                    weights[i, j] = diff
                elif wt_scheme == 'quadratic':
                    weights[i, j] = diff ** 2
                elif not wt_scheme:  # unweighted
                    weights[i, j] = bool(diff)
                else:
                    raise ValueError('Invalid weight scheme specified for '
                                     'kappa: {}'.format(wt_scheme))

    hist_true = np.bincount(y_true, minlength=num_ratings)
    hist_true = hist_true[: num_ratings] / num_scored_items
    hist_pred = np.bincount(y_pred, minlength=num_ratings)
    hist_pred = hist_pred[: num_ratings] / num_scored_items
    expected = np.outer(hist_true, hist_pred)

    # Normalize observed array
    observed = observed / num_scored_items

    # If all weights are zero, that means no disagreements matter.
    k = 1.0
    if np.count_nonzero(weights):
        k -= (sum(sum(weights * observed)) / sum(sum(weights * expected)))

    return k
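A quick sanity check of the metric on illustrative ratings: identical vectors score exactly 1.0 (all weights on the diagonal are zero), and quadratic weighting penalizes near-misses less than the unweighted form does.

# Illustrative check of the kappa implementation above
y_gold = [1, 2, 3, 4, 4, 2]
y_hat = [1, 2, 3, 4, 3, 2]
print kappa(y_gold, y_gold, weights='quadratic')  # perfect agreement -> 1.0
print kappa(y_gold, y_hat, weights='quadratic')   # off-by-one penalized lightly
print kappa(y_gold, y_hat)                        # unweighted: any mismatch counts fully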
In [3]:
# Lasso CV
train_data = df_train_n.values.copy()
test_data = df_test_n.values.copy()
# Column 0 is Id, column 1 is the (normalized) Response
X_train = train_data[:, 2:]
Y_train = train_data[:, 1]
X_test = test_data[:, 2:]
Y_test = test_data[:, 1]
t0 = time()
clf = linear_model.LassoLarsCV()
clf.fit(X_train, Y_train)
pred = clf.predict(X_test)
mms = preprocessing.MinMaxScaler()
x = df["Response"].values.astype(np.float).reshape(-1, 1)
mms.fit(x)
# Map the normalized predictions and targets back to the original 1-8 Response scale
pred_transformed = mms.inverse_transform(pred.reshape(-1, 1)).ravel()
Y_test_transformed = mms.inverse_transform(Y_test.reshape(-1, 1)).ravel()
k = kappa(Y_test_transformed, pred_transformed, weights='quadratic')
In [4]:
params = clf.alpha_
print "The chosen alpha is: ", params
print "Kappa is: ", k
In [5]:
'''
df_ak = pd.DataFrame(alpha_kappa, columns=["alpha", "kappa", "time"])
plt.figure(1, figsize=[10, 10])
plt.subplot(211)
plt.title("alpha vs. kappa: linear lasso - test#1")
plt.xlabel("alpha[0.001,0.1]")
plt.ylabel("kappa")
plt.scatter(x=df_ak.alpha, y=df_ak.kappa)
plt.subplot(212)
plt.title("alpha vs. time: linear lasso - test#1")
plt.xlabel("alpha[0.001,0.1]")
plt.ylabel("time(s)")
plt.scatter(x=df_ak.alpha, y=df_ak.time)
#plt.savefig('images/scatterLassoCV_alpha_kappa_test1.png')
df_ak.describe()
'''
Out[5]:
In [ ]:
train_data = df_train_n.values.copy()
test_data = df_test_n.values.copy()
# Column 0 is Id, column 1 is the (normalized) Response
X_train = train_data[:, 2:]
X_test = test_data[:, 2:]
Y_test = test_data[:, 1]
# Encode the normalized Response values as integer class labels
# (the original astype(int) cast truncated them all to 0 or 1)
classes, Y_train = np.unique(train_data[:, 1], return_inverse=True)
t0 = time()
clf = RandomForestClassifier(n_estimators=350)
clf.fit(X_train, Y_train)
pred = clf.predict(X_test)
# Map predicted class indices back to normalized values, then to the 1-8 scale
pred = classes[pred]
mms = preprocessing.MinMaxScaler()
x = df["Response"].values.astype(np.float).reshape(-1, 1)
mms.fit(x)
pred_transformed = mms.inverse_transform(pred.reshape(-1, 1)).ravel()
Y_test_transformed = mms.inverse_transform(Y_test.reshape(-1, 1)).ravel()
k = kappa(Y_test_transformed, pred_transformed, weights='quadratic')
print "The Kappa for Random Forest Test #3 is ", k
In [6]:
#Outputting file names in a folder
from os import walk
mypath = "prud_files/"  # assumed data folder; mypath was previously undefined
f = []
for (dirpath, dirnames, filenames) in walk(mypath):
    f.extend(filenames)
    break
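For comparison, the same listing can be done in one pass with glob, assuming mypath is set as above; note that glob returns paths prefixed with mypath, unlike walk's bare filenames.

# Equivalent listing via glob, filtering out directories
import os.path
from glob import glob
f = [p for p in glob(os.path.join(mypath, "*")) if os.path.isfile(p)]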