In [1]:
import pandas as pd
import numpy as np
import time
import operator
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import log_loss, f1_score, accuracy_score
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
In [2]:
# Load the cleaned train/test feature sets plus the raw target column.
trn = pd.read_csv("../input/train_clean.csv")
target = pd.read_csv("../input/train.csv", usecols=["target"])
tst = pd.read_csv("../input/test_clean.csv")

# Keep the customer ids for the submission file, then drop them as features.
test_id = tst["ncodpers"]
trn = trn.drop(columns=["ncodpers"])
tst = tst.drop(columns=["ncodpers"])
print(trn.shape, target.shape, tst.shape)
Train 데이터와 Test 데이터의 컬럼이 동일해야 하므로 확인을 해봐야 합니다 :)
In [3]:
trn.columns == tst.columns
Out[3]:
Scikit-learn의 경우 수치형 데이터가 아니면 들어가질 않기 때문에
수치형이 아닌 친구를 확인해봐요
In [5]:
# List the non-numeric (object-dtype) columns that still need encoding,
# since scikit-learn estimators only accept numeric input.
object_columns = [c for c in trn.columns if trn[c].dtype == "object"]
for c in object_columns:
    print(c)
In [6]:
# Integer-encode every object column. The encoder is fit on train and test
# stacked together so both frames share one consistent label mapping.
for col in trn.columns:
    if trn[col].dtype != "object":
        continue
    encoder = LabelEncoder()
    encoder.fit(pd.concat([trn[col], tst[col]]))  # stack train + test values
    trn[col] = encoder.transform(trn[col])        # overwrite with integer codes
    tst[col] = encoder.transform(tst[col])
In [7]:
# 이제 변수 타입들 다시 확인
# Re-check the column dtypes: everything should now be numeric in both frames.
for name in trn.columns:
    print(name, trn[name].dtype, tst[name].dtype)
In [9]:
# target을 이제 확인해봐요
# Class-frequency check: number of rows for each unique target value.
for label in np.unique(target):
    print(label, (target["target"] == label).sum())
In [10]:
# 빈도가 적은 데이터 제거
# Drop the rarest classes, keeping only the 18 labels listed below.
rem_targets = [2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 17, 18, 19, 21, 22, 23] # 18 classes
# Compute the keep-mask once and apply it to both features and labels so the
# two stay row-aligned.
keep_mask = target["target"].isin(rem_targets)
trn = trn[keep_mask]
target = target[keep_mask]
# Re-encode the surviving labels to 0..17. Pass the 1-D Series rather than the
# whole DataFrame: LabelEncoder expects 1-D y and warns/errors on a (n, 1)
# column array.
target = LabelEncoder().fit_transform(target["target"])
In [11]:
def _append_score(scores, key, value):
    """Append `value` to the list stored under `key` in `scores` (in place)."""
    values = scores.get(key, [])
    values.append(value)
    scores[key] = values


def _record_scores(scores, model, x, y):
    """Score a fitted `model` on (x, y), accumulating accuracy, weighted f1,
    and log loss into `scores` (one list entry per CV split)."""
    preds = model.predict(x)
    _append_score(scores, 'accuracy', accuracy_score(y, preds))
    _append_score(scores, 'f1 score', f1_score(y, preds, average='weighted'))
    _append_score(scores, 'log loss', log_loss(y, model.predict_proba(x)))


def evaluate(x, y, model):
    """Cross-validate `model` with 3 stratified 90/10 shuffle splits.

    Parameters: x is a DataFrame of features, y an array-like of encoded
    labels, model any sklearn-style classifier with fit/predict/predict_proba.
    Returns (trn_scores, vld_scores): dicts mapping metric name to the list of
    per-split scores measured on the training and validation folds.
    """
    trn_scores = dict()
    vld_scores = dict()
    # 10% held out per split; fixed random_state makes the splits reproducible.
    sss = StratifiedShuffleSplit(n_splits=3, test_size=0.1, random_state=777)
    for t_ind, v_ind in sss.split(x, y):
        # split data
        x_trn, x_vld = x.iloc[t_ind], x.iloc[v_ind]
        y_trn, y_vld = y[t_ind], y[v_ind]
        # fit on the training fold only
        model.fit(x_trn, y_trn)
        # evaluate on train fold, then on validation fold
        _record_scores(trn_scores, model, x_trn, y_trn)
        _record_scores(vld_scores, model, x_vld, y_vld)
    return trn_scores, vld_scores
def _print_metric_section(title, scores):
    """Print one titled section: per-metric mean and raw per-split values."""
    prefix = '        '
    cols = ['accuracy', 'f1 score', 'log loss']
    print('=' * 50)
    print(title)
    for col in cols:
        print('-' * 50)
        print('# {}'.format(col))
        print('# {} Mean : {}'.format(prefix, np.mean(scores[col])))
        print('# {} Raw : {}'.format(prefix, scores[col]))


def print_scores(trn_scores, vld_scores):
    """Pretty-print the train and validation metric summaries produced by
    evaluate(); the two sections use the same layout."""
    _print_metric_section('TRAIN EVAL', trn_scores)
    _print_metric_section('VALID EVAL', vld_scores)
def print_time(end, start):
    """Print a separator line followed by the elapsed wall time between the
    two timestamps, rounded to whole seconds."""
    print('=' * 50)
    print('{} secs'.format(round(end - start)))
def fit_and_eval(trn, target, model, start=None):
    """Cross-validate `model` on (trn, target), print the metric summaries,
    then print the elapsed runtime.

    `start` is the timing origin (a time.time() timestamp). When None — the
    original behavior — it falls back to the module-level `st` set in the
    driver cell; passing it explicitly removes that hidden global dependency.
    """
    trn_scores, vld_scores = evaluate(trn, target, model)
    print_scores(trn_scores, vld_scores)
    print_time(time.time(), st if start is None else start)
In [12]:
# Baseline model: multinomial logistic regression on the encoded features.
st = time.time()
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(n_jobs=-1, random_state=777) # n_jobs=-1 uses every available CPU core
fit_and_eval(trn, target, model)
# 58 sec
훈련 데이터와 검증 데이터의 평가 척도가 비슷한지 확인을 해야합니다-!
너무 다른 경우 오버피팅의 가능성이 존재해요
로지스틱 regression에서 C의 값은 regularization의 역수임
지금 과적합 상태라면 c의 값을 낮추면 정규성이 증가됨~!!
모델은 복잡도 기준으로 바라볼 것
Sorted Feature Importance 기준으로 볼 경우 변수의 scale이 다른데 그냥 한번에 돌림..! normalize를 하고 다시 돌려볼 것
In [13]:
# Utility
def observe_model_lr(model, target_num=0):
    """Inspect a fitted LogisticRegression: print the coefficient vector of
    one target class, map each coefficient to its column name, and rank the
    columns by absolute coefficient magnitude.

    `target_num` selects which class's coefficients to inspect (default 0,
    the original hard-coded behavior). Relies on the module-level `trn`
    DataFrame for column names — assumes its column order matches the matrix
    the model was fitted on (TODO confirm if trn is mutated later).
    Returns the (column, |coef|) pairs sorted descending by magnitude.
    """
    print('=' * 50)
    print(model)
    print('=' * 50)
    print('# Coefficients for target_num == {}'.format(target_num))
    print(model.coef_[target_num])
    print('-' * 50)
    print('# Mapped to Column Name')
    prefix = '        '
    coefs = dict()
    for i, coef in enumerate(model.coef_[target_num]):
        print('{} {} \t {}'.format(prefix, round(coef, 5), trn.columns[i]))
        coefs[trn.columns[i]] = np.absolute(coef)
    print('-' * 50)
    print('# Sorted Feature Importance')
    # Sort by absolute magnitude, largest first.
    coefs_sorted = sorted(coefs.items(), key=operator.itemgetter(1), reverse=True)
    for item in coefs_sorted:
        print('{} {} \t {}'.format(prefix, round(item[1], 5), item[0]))
    return coefs_sorted
def plot_coef(coef):
    """Bar-plot feature importances from (name, value) pairs, largest first.

    `coef` is the sorted list returned by observe_model_lr.
    """
    names = [item[0] for item in coef]
    values = [item[1] for item in coef]
    f, ax = plt.subplots(figsize=(20, 15))
    # seaborn >= 0.12 requires x/y as keyword arguments (positional x, y was
    # removed); also draw explicitly on the axes we just created.
    sns.barplot(x=names, y=values, alpha=0.5, ax=ax)
    ax.set_title('Feature Importance for Model : Logistic Regression')
    ax.set(xlabel='Column Name', ylabel='Feature Importance')
In [14]:
# 모델 상세 보기
coef = observe_model_lr(model)
In [ ]:
from datetime import datetime
import os

print('=' * 50)
print('# Test shape : {}'.format(tst.shape))

# Refit on the full filtered training data, then rank class probabilities.
model = LogisticRegression(n_jobs=-1, random_state=777)
model.fit(trn, target)
proba = model.predict_proba(tst)
# Reverse each row of the ascending argsort so the highest-probability
# class index comes first.
preds = np.argsort(proba, axis=1)[:, ::-1]
In [ ]:
# Product flag columns from the original dataset, index-aligned with the raw
# target codes 0..23.
cols = ['ind_ahor_fin_ult1', 'ind_aval_fin_ult1', 'ind_cco_fin_ult1',
        'ind_cder_fin_ult1', 'ind_cno_fin_ult1', 'ind_ctju_fin_ult1',
        'ind_ctma_fin_ult1', 'ind_ctop_fin_ult1', 'ind_ctpp_fin_ult1',
        'ind_deco_fin_ult1', 'ind_deme_fin_ult1', 'ind_dela_fin_ult1',
        'ind_ecue_fin_ult1', 'ind_fond_fin_ult1', 'ind_hip_fin_ult1',
        'ind_plan_fin_ult1', 'ind_pres_fin_ult1', 'ind_reca_fin_ult1',
        'ind_tjcr_fin_ult1', 'ind_valo_fin_ult1', 'ind_viv_fin_ult1',
        'ind_nomina_ult1', 'ind_nom_pens_ult1', 'ind_recibo_ult1']
# Names of the kept classes, in encoded-label order: rem_targets is sorted,
# matching LabelEncoder's sorted class order. (Original used cols[i] while
# ignoring the loop variable `col` — use `col` directly.)
target_cols = [col for i, col in enumerate(cols) if i in rem_targets]
In [ ]:
# Build the submission: the top-7 predicted products per customer,
# space-separated, as the competition format requires.
final_preds = []
for row in preds:
    top_seven = [target_cols[idx] for idx in row[:7]]
    final_preds.append(' '.join(top_seven))

out_df = pd.DataFrame({'ncodpers': test_id, 'added_products': final_preds})
file_name = datetime.now().strftime("result_%Y%m%d%H%M%S") + '.csv'
out_df.to_csv(os.path.join('../output', file_name), index=False)