In [1]:
import pandas as pd
import numpy as np
import time
import operator
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import log_loss, f1_score, accuracy_score

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [2]:
trn = pd.read_csv("../input/train_clean.csv")
target = pd.read_csv("../input/train.csv", usecols=["target"])
tst = pd.read_csv("../input/test_clean.csv")
test_id = tst["ncodpers"]
tst.drop(["ncodpers"], axis=1, inplace=True)
trn.drop(["ncodpers"], axis=1, inplace=True)

print(trn.shape, target.shape, tst.shape)


(45619, 22) (45619, 1) (929615, 22)

Since the train and test data must have identical columns, we should verify that :)


In [3]:
trn.columns == tst.columns


Out[3]:
array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True], dtype=bool)
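
A compact way to fold this check into a single boolean (an aside, not in the original notebook):

(trn.columns == tst.columns).all()  # True only if every column name matches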

Scikit-learn models only accept numeric inputs, so let's find the columns that are not yet numeric.


In [5]:
# list the columns that are still strings (object dtype)
for col in trn.columns:
    if trn[col].dtype == "object":
        print(col)


fecha_dato
ind_empleado
pais_residencia
sexo
fecha_alta
ult_fec_cli_1t
tiprel_1mes
indresi
indext
conyuemp
canal_entrada
indfall
nomprov
segmento

In [6]:
for col in trn.columns:
    if trn[col].dtype == "object":
        lb = LabelEncoder()
        lb.fit(pd.concat([trn[col], tst[col]]))  # fit on train and test stacked together so both share one encoding
        trn[col] = lb.transform(trn[col])  # overwrite the column with its integer codes
        tst[col] = lb.transform(tst[col])

In [7]:
# re-check every column's dtype
for col in trn.columns:
    print(col, trn[col].dtype, tst[col].dtype)


fecha_dato int64 int64
ind_empleado int64 int64
pais_residencia int64 int64
sexo int64 int64
age int64 int64
fecha_alta int64 int64
ind_nuevo int64 int64
antiguedad int64 int64
indrel int64 int64
ult_fec_cli_1t int64 int64
indrel_1mes int64 int64
tiprel_1mes int64 int64
indresi int64 int64
indext int64 int64
conyuemp int64 int64
canal_entrada int64 int64
indfall int64 int64
cod_prov int64 int64
nomprov int64 int64
ind_actividad_cliente int64 int64
renta float64 float64
segmento int64 int64

In [9]:
# now let's look at the target distribution

for t in np.unique(target):
    print(t, sum(target["target"]==t))


2 9452
3 9
4 1934
5 55
6 349
7 222
8 154
9 503
10 33
11 1085
12 1219
13 246
14 4
15 21
16 8
17 2942
18 4733
19 159
20 3
21 5151
22 8218
23 9119

In [10]:
# drop the classes that appear only rarely
rem_targets = [2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 17, 18, 19, 21, 22, 23]  # 18 classes
trn = trn[target["target"].isin(rem_targets)]
target = target[target["target"].isin(rem_targets)]
target = LabelEncoder().fit_transform(target["target"])  # encode the 1-D Series; passing the DataFrame raises a DataConversionWarning
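
As an aside (not in the original notebook), rem_targets could also be derived from the frequency table instead of being hard-coded. This sketch assumes it runs before the filtering above, and the cutoff of 20 samples is an illustrative choice that happens to reproduce the 18 classes listed:

counts = target["target"].value_counts()
rem_targets = sorted(counts[counts >= 20].index.tolist())  # keep classes with at least 20 rows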

In [11]:
def evaluate(x, y, model):
    trn_scores = dict(); vld_scores = dict()
    sss = StratifiedShuffleSplit(n_splits=3, test_size=0.1, random_state=777)  # hold out 10% for validation; random_state is the seed
    for t_ind, v_ind in sss.split(x,y):
        # split data
        x_trn, x_vld = x.iloc[t_ind], x.iloc[v_ind]
        y_trn, y_vld = y[t_ind], y[v_ind]

        # fit model
        model.fit(x_trn, y_trn)  # fit on the training split only!
        
        # eval _ trn
        preds = model.predict(x_trn)
        acc_scores = trn_scores.get('accuracy', [])
        acc_scores.append(accuracy_score(y_trn, preds))
        trn_scores['accuracy'] = acc_scores

        f1_scores = trn_scores.get('f1 score', [])
        f1_scores.append(f1_score(y_trn, preds, average='weighted'))
        trn_scores['f1 score'] = f1_scores
        
        preds = model.predict_proba(x_trn)

        log_scores = trn_scores.get('log loss', [])
        log_scores.append(log_loss(y_trn, preds))
        trn_scores['log loss'] = log_scores

        # eval _ vld
        preds = model.predict(x_vld)  # compare the predictions against y_vld
        acc_scores = vld_scores.get('accuracy', [])
        acc_scores.append(accuracy_score(y_vld, preds))
        vld_scores['accuracy'] = acc_scores

        f1_scores = vld_scores.get('f1 score', [])
        f1_scores.append(f1_score(y_vld, preds, average='weighted'))
        vld_scores['f1 score'] = f1_scores
        
        preds = model.predict_proba(x_vld)

        log_scores = vld_scores.get('log loss', [])
        log_scores.append(log_loss(y_vld, preds))
        vld_scores['log loss'] = log_scores
    return trn_scores, vld_scores

def print_scores(trn_scores, vld_scores):
    prefix = '        '
    cols = ['accuracy', 'f1 score','log loss']
    print('='*50)
    print('TRAIN EVAL')
    for col in cols:
        print('-'*50)
        print('# {}'.format(col))
        print('# {} Mean : {}'.format(prefix, np.mean(trn_scores[col])))
        print('# {} Raw  : {}'.format(prefix, trn_scores[col]))

    print('='*50)
    print('VALID EVAL')
    for col in cols:
        print('-'*50)
        print('# {}'.format(col))
        print('# {} Mean : {}'.format(prefix, np.mean(vld_scores[col])))
        print('# {} Raw  : {}'.format(prefix, vld_scores[col]))

def print_time(end, start):
    print('='*50)
    elapsed = end - start
    print('{} secs'.format(round(elapsed)))
    
def fit_and_eval(trn, target, model):
    trn_scores, vld_scores = evaluate(trn,target,model)
    print_scores(trn_scores, vld_scores)
    print_time(time.time(), st)

In [12]:
st = time.time()
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(n_jobs=-1, random_state=777)  # n_jobs=-1 uses every available CPU core
fit_and_eval(trn, target, model)
# 58 sec


C:\Users\Byeon\Anaconda3\lib\site-packages\sklearn\metrics\classification.py:1113: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)
==================================================
TRAIN EVAL
--------------------------------------------------
# accuracy
#          Mean : 0.2689817635351935
#          Raw  : [0.26660168149140978, 0.27308395272328501, 0.26725965639088584]
--------------------------------------------------
# f1 score
#          Mean : 0.1935636190100405
#          Raw  : [0.18744667752682029, 0.20518539950478837, 0.18805877999851281]
--------------------------------------------------
# log loss
#          Mean : 2.0239253340972945
#          Raw  : [2.0252648324052274, 2.0237462642254758, 2.0227649056611798]
==================================================
VALID EVAL
--------------------------------------------------
# accuracy
#          Mean : 0.26622807017543865
#          Raw  : [0.26337719298245615, 0.27543859649122809, 0.25986842105263158]
--------------------------------------------------
# f1 score
#          Mean : 0.19096039754864078
#          Raw  : [0.18323018001326635, 0.20681280442409578, 0.18283820820856014]
--------------------------------------------------
# log loss
#          Mean : 2.0253845878611636
#          Raw  : [2.0192749821789748, 2.0252751868334657, 2.0316035945710502]
==================================================
65 secs

Check that the train and validation metrics are similar!

If they differ a lot, the model may be overfitting.

In logistic regression, C is the inverse of the regularization strength.

If the model is currently overfitting, lowering C increases the regularization.

Think about a model in terms of its complexity.

The sorted feature importance below was computed with the variables on very different scales, fed in as-is; normalize them and run it again.
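
A minimal sketch of both suggestions combined, stronger regularization plus normalization (C=0.1 is an illustrative value, not a tuned one):

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

st = time.time()
scaled_model = make_pipeline(StandardScaler(),
                             LogisticRegression(C=0.1, n_jobs=-1, random_state=777))
fit_and_eval(trn, target, scaled_model)  # same evaluation loop as above, now on standardized features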


In [13]:
# Utility

def observe_model_lr(model):
    target_num = 0
    print('='*50)
    print(model)
    
    print('='*50)
    print('# Coefficients for target_num == {}'.format(target_num))
    print(model.coef_[target_num])
    
    print('-'*50)
    print('# Mapped to Column Name')
    prefix = '    '
    coefs = dict()
    for i, coef in enumerate(model.coef_[target_num]):
        print('{} {} \t {}'.format(prefix, round(coef,5), trn.columns[i]))
        coefs[trn.columns[i]] = np.absolute(coef)

    print('-'*50)
    print('# Sorted Feature Importance')
    coefs_sorted = sorted(coefs.items(), key=operator.itemgetter(1), reverse=True)
    for item in coefs_sorted:
        print('{} {} \t {}'.format(prefix, round(item[1],5), item[0]))
    
    return coefs_sorted

def plot_coef(coef):
    x = []; y = []
    for item in coef:
        x.append(item[0])
        y.append(item[1])

    f, ax = plt.subplots(figsize=(20, 15))
    sns.barplot(x=x, y=y, alpha=0.5)
    ax.set_title('Feature Importance for Model : Logistic Regression')
    ax.set(xlabel='Column Name', ylabel='Feature Importance')

In [14]:
# inspect the fitted model in detail
coef = observe_model_lr(model)


==================================================
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=-1,
          penalty='l2', random_state=777, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
==================================================
# Coefficients for target_num == 0
[  0.00000000e+00  -1.44479305e-04  -1.72440483e-03  -6.62831797e-05
  -4.00738642e-03  -1.37587306e-04   9.43799153e-05  -9.75453910e-03
  -4.57973205e-05   1.71631976e-05  -4.77794835e-05   5.31567487e-05
  -9.51667325e-05   1.64720342e-05  -4.82183632e-05   2.95537254e-03
   8.31537519e-07  -1.12444979e-03  -1.08733896e-03  -1.07982690e-04
  -1.84225687e-07   4.04364051e-06]
--------------------------------------------------
# Mapped to Column Name
     0.0 	 fecha_dato
     -0.00014 	 ind_empleado
     -0.00172 	 pais_residencia
     -7e-05 	 sexo
     -0.00401 	 age
     -0.00014 	 fecha_alta
     9e-05 	 ind_nuevo
     -0.00975 	 antiguedad
     -5e-05 	 indrel
     2e-05 	 ult_fec_cli_1t
     -5e-05 	 indrel_1mes
     5e-05 	 tiprel_1mes
     -0.0001 	 indresi
     2e-05 	 indext
     -5e-05 	 conyuemp
     0.00296 	 canal_entrada
     0.0 	 indfall
     -0.00112 	 cod_prov
     -0.00109 	 nomprov
     -0.00011 	 ind_actividad_cliente
     -0.0 	 renta
     0.0 	 segmento
--------------------------------------------------
# Sorted Feature Importance
     0.00975 	 antiguedad
     0.00401 	 age
     0.00296 	 canal_entrada
     0.00172 	 pais_residencia
     0.00112 	 cod_prov
     0.00109 	 nomprov
     0.00014 	 ind_empleado
     0.00014 	 fecha_alta
     0.00011 	 ind_actividad_cliente
     0.0001 	 indresi
     9e-05 	 ind_nuevo
     7e-05 	 sexo
     5e-05 	 tiprel_1mes
     5e-05 	 conyuemp
     5e-05 	 indrel_1mes
     5e-05 	 indrel
     2e-05 	 ult_fec_cli_1t
     2e-05 	 indext
     0.0 	 segmento
     0.0 	 indfall
     0.0 	 renta
     0.0 	 fecha_dato

Code to produce the Kaggle submission file


In [ ]:
from datetime import datetime
import os

print('='*50)
print('# Test shape : {}'.format(tst.shape))

model = LogisticRegression(n_jobs=-1, random_state=777)
model.fit(trn,target)

preds = model.predict_proba(tst)
preds = np.fliplr(np.argsort(preds, axis=1))  # flip so the highest-probability class index comes first (leftmost)
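
A toy check of the argsort/fliplr trick (illustrative only, not part of the original notebook):

demo = np.array([[0.1, 0.7, 0.2]])
print(np.fliplr(np.argsort(demo, axis=1)))  # [[1 2 0]] : class 1 is most probable, then 2, then 0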

In [ ]:
cols = ['ind_ahor_fin_ult1', 'ind_aval_fin_ult1', 'ind_cco_fin_ult1',
        'ind_cder_fin_ult1', 'ind_cno_fin_ult1',  'ind_ctju_fin_ult1',
        'ind_ctma_fin_ult1', 'ind_ctop_fin_ult1', 'ind_ctpp_fin_ult1',
        'ind_deco_fin_ult1', 'ind_deme_fin_ult1', 'ind_dela_fin_ult1',
        'ind_ecue_fin_ult1', 'ind_fond_fin_ult1', 'ind_hip_fin_ult1',
        'ind_plan_fin_ult1', 'ind_pres_fin_ult1', 'ind_reca_fin_ult1',
        'ind_tjcr_fin_ult1', 'ind_valo_fin_ult1', 'ind_viv_fin_ult1',
        'ind_nomina_ult1',   'ind_nom_pens_ult1', 'ind_recibo_ult1']
# LabelEncoder assigns codes in sorted order and rem_targets is sorted,
# so encoded class i maps back to cols[rem_targets[i]]
target_cols = [cols[i] for i in rem_targets]

In [ ]:
final_preds = []
for pred in preds:
    top_products = []
    for i, product in enumerate(pred):
        top_products.append(target_cols[product])
        if i == 6:
            break  # keep only the 7 most probable products per customer
    final_preds.append(' '.join(top_products))

out_df = pd.DataFrame({'ncodpers':test_id, 'added_products':final_preds})
file_name = datetime.now().strftime("result_%Y%m%d%H%M%S") + '.csv'
out_df.to_csv(os.path.join('../output',file_name), index=False)