In [1]:
    
import pandas as pd
import numpy as np
import time
import operator
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import log_loss, f1_score, accuracy_score
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
    
In [2]:
    
trn = pd.read_csv("../input/train_clean.csv")
target = pd.read_csv("../input/train.csv", usecols=["target"])
tst = pd.read_csv("../input/test_clean.csv")
test_id = tst["ncodpers"]
tst.drop(["ncodpers"], axis=1, inplace=True)
trn.drop(["ncodpers"], axis=1, inplace=True)
print(trn.shape, target.shape, tst.shape)
    
    
In [3]:
    
# .info() prints its report directly and returns None, so call it without print()
trn.info(); target.info(); tst.info()
    
    
In [4]:
    
# check that train and test have identical column layouts
trn.columns == tst.columns
    
In [5]:
    
# list the categorical (object-dtype) columns
for col in trn.columns:
    if trn[col].dtype == "object":
        print(col)
    
    
In [6]:
    
for col in trn.columns:
    if trn[col].dtype == "object":
        lb = LabelEncoder()
        # fit on train + test combined so both share the same encoding
        lb.fit(pd.concat([trn[col], tst[col]]))
        trn[col] = lb.transform(trn[col])
        tst[col] = lb.transform(tst[col])
    
In [7]:
    
# confirm that no object-dtype columns remain after encoding
for col in trn.columns:
    print(col, trn[col].dtype, tst[col].dtype)
    
    
In [8]:
    
# per-class sample counts (np.unique needs the 1-d column, not the DataFrame)
for t in np.unique(target["target"]):
    print(t, sum(target["target"] == t))
    
    
In [9]:
    
# Remove classes with too few samples
    
In [10]:
    
rem_targets = [2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 17, 18, 19, 21, 22, 23]  # 18 classes
trn = trn[target["target"].isin(rem_targets)]
target = target[target["target"].isin(rem_targets)]
# re-encode the surviving labels to 0..17 (LabelEncoder expects a 1-d array, not a DataFrame)
target = LabelEncoder().fit_transform(target["target"])
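
Note that fit_transform above discards the fitted encoder. If the encoded labels ever need to be mapped back to the original target codes (e.g. for a submission file), a variant that keeps the encoder around, as a minimal sketch (le is a name introduced here, not from the original notebook):

In [ ]:
    
# (sketch) keep the fitted encoder so predictions can be mapped back later
# le = LabelEncoder()
# target = le.fit_transform(target["target"])
# le.classes_[k] then recovers the original target code for encoded label k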
    
    
In [14]:
    
def evaluate(x, y, model):
    # 3-fold stratified shuffle-split CV; collects per-fold train/valid scores
    trn_scores = dict(); vld_scores = dict()
    sss = StratifiedShuffleSplit(n_splits=3, test_size=0.1, random_state=777)
    for t_ind, v_ind in sss.split(x, y):
        # split data
        x_trn, x_vld = x.iloc[t_ind], x.iloc[v_ind]
        y_trn, y_vld = y[t_ind], y[v_ind]
        # fit model
        model.fit(x_trn, y_trn)
        
        # eval _ trn
        preds = model.predict(x_trn)
        trn_scores.setdefault('accuracy', []).append(accuracy_score(y_trn, preds))
        trn_scores.setdefault('f1 score', []).append(f1_score(y_trn, preds, average='weighted'))
        preds = model.predict_proba(x_trn)
        trn_scores.setdefault('log loss', []).append(log_loss(y_trn, preds))
        
        # eval _ vld
        preds = model.predict(x_vld)
        vld_scores.setdefault('accuracy', []).append(accuracy_score(y_vld, preds))
        vld_scores.setdefault('f1 score', []).append(f1_score(y_vld, preds, average='weighted'))
        preds = model.predict_proba(x_vld)
        vld_scores.setdefault('log loss', []).append(log_loss(y_vld, preds))
    return trn_scores, vld_scores
def print_scores(trn_scores, vld_scores):
    prefix = '        '
    cols = ['accuracy', 'f1 score','log loss']
    print('='*50)
    print('TRAIN EVAL')
    for col in cols:
        print('-'*50)
        print('# {}'.format(col))
        print('# {} Mean : {}'.format(prefix, np.mean(trn_scores[col])))
        print('# {} Raw  : {}'.format(prefix, trn_scores[col]))
    print('='*50)
    print('VALID EVAL')
    for col in cols:
        print('-'*50)
        print('# {}'.format(col))
        print('# {} Mean : {}'.format(prefix, np.mean(vld_scores[col])))
        print('# {} Raw  : {}'.format(prefix, vld_scores[col]))
def print_time(end, start):
    print('='*50)
    elapsed = end - start
    print('{} secs'.format(round(elapsed)))
    
def fit_and_eval(trn, target, model):
    start = time.time()  # measure locally instead of relying on the global `st`
    trn_scores, vld_scores = evaluate(trn, target, model)
    print_scores(trn_scores, vld_scores)
    print_time(time.time(), start)
    
In [15]:
    
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(n_jobs=-1, random_state=777)
fit_and_eval(trn, target, model)
# ~58 sec
    
    
    
In [25]:
    
# Utility
def observe_model_lr(model):
    target_num = 0
    print('='*50)
    print(model)
    
    print('='*50)
    print('# Coefficients for target_num == {}'.format(target_num))
    print(model.coef_[target_num])
    
    print('-'*50)
    print('# Mapped to Column Name')
    prefix = '    '
    coefs = dict()
    for i, coef in enumerate(model.coef_[target_num]):
        print('{} {} \t {}'.format(prefix, round(coef,5), trn.columns[i]))
        coefs[trn.columns[i]] = np.absolute(coef)
    print('-'*50)
    print('# Sorted Feature Importance')
    coefs_sorted = sorted(coefs.items(), key=operator.itemgetter(1), reverse=True)
    for item in coefs_sorted:
        print('{} {} \t {}'.format(prefix, round(item[1],5), item[0]))
    
    return coefs_sorted
def plot_coef(coef):
    x = []; y = []
    for item in coef:
        x.append(item[0])
        y.append(item[1])
    f, ax = plt.subplots(figsize=(20, 15))
    sns.barplot(x=x, y=y, alpha=0.5)  # keyword args required in recent seaborn
    ax.set_title('Feature Importance for Model : Logistic Regression')
    ax.set(xlabel='Column Name', ylabel='Feature Importance')
    plt.xticks(rotation=90)  # column names are long; rotate for readability
    
In [26]:
    
# Inspect the fitted model in detail
coef = observe_model_lr(model)
    
    
In [27]:
    
# Visualize the key features
plot_coef(coef)
    
    
In [29]:
    
trn.head()
    
In [30]:
    
trn["age"] = (trn["age"]/10).astype(int)
    
In [33]:
    
tst["age"] = (tst["age"]/10).astype(int)
    
In [ ]:
    
# indresi, conyuemp, ult_fec_cli_1t have very low frequency counts, so drop them (see the sketch below)
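
A minimal sketch of the drop described above; the column names come from the comment, and whether they are still present depends on the earlier cleaning step:

In [ ]:
    
# (sketch) drop the low-frequency columns named above, if present
for col in ["indresi", "conyuemp", "ult_fec_cli_1t"]:
    if col in trn.columns:
        trn.drop(col, axis=1, inplace=True)
    if col in tst.columns:
        tst.drop(col, axis=1, inplace=True)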