In [1]:
import pandas as pd
import matplotlib.pylab as plt
import seaborn as sns
from scipy.stats import ks_2samp
import numpy as np
from corgi.plots import numplot, catplot
# Raise the column limit pandas applies to DataFrame.info() output to 200.
pd.set_option('display.max_info_columns', 200)
# %matplotlib inline
# Every figure below is drawn with seaborn's 'whitegrid' style.
sns.set_style('whitegrid')
In [2]:
# Column names used by the cells below.
target = "PD_12_90"    # passed as `target=` to numplot / catplot
date = "OPEN_DATE1"    # parsed to datetime, then passed as `date=`
# Not referenced by any other cell visible here; presumably the request key
# and a train/test flag column -- TODO confirm.
ID = "REQUEST_ID"
isTrain = "isTrain"
In [3]:
# Read the four training parts and the test set with identical parser
# settings (cp1251 encoding, ';' separator), in the same order as before.
train1, train2, train3, train4, test = (
    pd.read_csv(csv_path, encoding='cp1251', sep=';')
    for csv_path in (
        'data/BAZA_SC11_ALGOMOST_BOY_TRAYN_P1.csv',
        'data/BAZA_SC11_ALGOMOST_BOY_TRAYN_P2.csv',
        'data/BAZA_SC11_ALGOMOST_BOY_TRAYN_P3.csv',
        'data/BAZA_SC11_ALGOMOST_BOY_TRAYN_P4.csv',
        'data/baza_SC11_ALGOMOST_boy_test.csv',
    )
)
# Stack the parts row-wise and renumber the rows 0..n-1.
train = pd.concat([train1, train2, train3, train4], axis=0, ignore_index=True)
In [4]:
# The date column is parsed as day/month/two-digit-year in both frames.
train[date], test[date] = (
    pd.to_datetime(frame[date], format="%d/%m/%y") for frame in (train, test)
)
In [18]:
# train.info()
In [26]:
# feature = 'tu_regions_id_fact_zacr'
feature = 'MAX_all_symb'
# Categorical overview of the chosen feature; the last line displays it.
fig = catplot(
    feature, train, test,
    target=target, date=date,
    clip=0.01, figsize=(15, 6), rank=18
)
fig
Out[26]:
In [25]:
feature = 'salary'
# Numeric overview of the chosen feature, clipped to its 1%-99% quantiles;
# the last line displays it.
fig = numplot(
    feature, train, test,
    target=target, date=date,
    clips=[0.01, 0.99], figsize=(20, 2), rank=18
)
fig
Out[25]:
In [25]:
# Feature passed to the numplot call at the end of this cell.
feature = 'age'
def quantile_func(q_value):
    """Build an aggregator that returns the ``q_value``-th percentile.

    The percentile is taken with the 'nearest' rule, so the result is always
    one of the observed values rather than an interpolated one.

    Parameters
    ----------
    q_value : float
        Percentile to compute, on the 0-100 scale used by ``np.percentile``.

    Returns
    -------
    callable
        A function of one array-like argument. It returns NaN for empty input.
        Its ``__name__`` is ``'q<q_value>'`` so that several of these can be
        told apart when they are passed together in one ``agg`` list.
    """
    # NumPy renamed the `interpolation` keyword to `method`; probe once with
    # known-good data so a TypeError can only come from the keyword itself.
    try:
        np.percentile([0.0], 50, method='nearest')
        rule = {'method': 'nearest'}
    except TypeError:
        rule = {'interpolation': 'nearest'}

    def _quantile(x):
        values = np.asarray(x)
        if values.size == 0:
            # Nothing to rank (e.g. an empty resampling bin).
            return np.nan
        return np.percentile(values, q=q_value, **rule)

    _quantile.__name__ = 'q{0}'.format(q_value)
    return _quantile
def numplot(feature, train, test, target, date, frequency='1M',
            clips=(0, 1.0), grid=False, rank=None, figsize=(15, 6)):
    """Draw a multi-panel overview of a numeric feature over train and test.

    Panels
    ------
    * main panel: per-period mean, min/max and 5-95 / 25-75 percentile bands
      of the clipped feature; train periods are shaded blue, test periods
      green, and the gap between them grey;
    * first side panel: KDE of train vs. test values, titled with their
      two-sample Kolmogorov-Smirnov statistic;
    * second side panel: KDE of train values split by the truthiness of
      ``target``, titled with the KS statistic of the two groups;
    * top strips: share of rows per period and, when the feature has missing
      values, the per-period NaN rate (absolute and scaled to its maximum);
    * corner box: the ``rank`` label.

    Parameters
    ----------
    feature : str
        Column present in both ``train`` and ``test``.
    train, test : pandas.DataFrame
        Frames holding ``feature`` and ``date``; ``train`` also holds
        ``target``.
    target : str
        Column of ``train`` that is cast to bool to split the second KDE.
    date : str
        Datetime column used as the time index for resampling.
    frequency : str
        Resampling rule handed to ``Series.resample``.
    clips : sequence of two floats
        Lower and upper quantile levels, computed on train and test together,
        to which all values are clipped before plotting.
    grid : bool
        Whether the main panel keeps its grid.
    rank : object, optional
        Value printed in the corner box; nothing is printed when ``None``.
    figsize : tuple
        Figure size in inches.

    Returns
    -------
    matplotlib.figure.Figure
    """
    # A fresh figure per call: drawing into a fixed figure number would pile
    # new axes on top of whatever an earlier call left there.
    fig = plt.figure(figsize=figsize)
    layout = (8, 7)
    ax = plt.subplot2grid(layout, (3, 0), rowspan=5, colspan=5)
    ax2 = plt.subplot2grid(layout, (3, 5), rowspan=5, sharey=ax)
    ax3 = plt.subplot2grid(layout, (3, 6), rowspan=5, sharey=ax)
    ax4 = plt.subplot2grid(layout, (0, 0), rowspan=1, colspan=5)
    ax5 = plt.subplot2grid(layout, (1, 0), rowspan=2, colspan=5)
    ax6 = plt.subplot2grid(layout, (0, 5), rowspan=2, colspan=2)

    train_feature = pd.Series(train[feature].values, index=train[date].values)
    test_feature = pd.Series(test[feature].values, index=test[date].values)
    data_feature = pd.concat([train_feature, test_feature], axis=0)
    train_max_date = train_feature.index.max()
    test_min_date = test_feature.index.min()

    # Clip every series to the same bounds, taken from the combined sample.
    lower, upper = data_feature.quantile(list(clips)).values
    data_feature = data_feature.clip(lower, upper)
    train_feature = train_feature.clip(lower, upper)
    test_feature = test_feature.clip(lower, upper)

    # NOTE(review): `shade=` / `vertical=` are deprecated in recent seaborn
    # releases; they are kept for the seaborn version this notebook was
    # written against -- confirm before upgrading.

    # ax2: train vs. test distribution
    train_observed = train_feature[train_feature.notnull()]
    test_observed = test_feature[test_feature.notnull()]
    sns.kdeplot(train_observed, shade=True, label=u'train', vertical=True, ax=ax2)
    sns.kdeplot(test_observed, shade=True, label=u'test', vertical=True, ax=ax2)
    ax2.set_title('KS: %g' % ks_2samp(train_observed, test_observed)[0])
    ax2.grid()
    ax2.set_xticks([])

    # ax3: train distribution split by target
    # Plain boolean arrays are used so that row selection is positional and
    # does not depend on aligning the (possibly repeated) date index.
    target_mask = pd.Series(train[target].values).astype(bool).values
    observed = train_feature.notnull().values
    negatives = train_feature[~target_mask & observed]
    positives = train_feature[target_mask & observed]
    sns.kdeplot(negatives, shade=True, label=u'0', vertical=True, ax=ax3, color='red', alpha=0.3)
    sns.kdeplot(positives, shade=True, label=u'1', vertical=True, ax=ax3, color='purple', alpha=0.3)
    # Missing values are excluded here exactly as for the train/test KS above.
    ax3.set_title('KS: %g' % ks_2samp(positives, negatives)[0])
    ax3.grid()
    ax3.set_xticks([])

    # ax: per-period statistics
    # String names select pandas' own mean/std/min/max implementations.
    ax_feature = data_feature[data_feature.notnull()].resample(frequency).agg(
        ['mean', 'std', 'min', 'max',
         quantile_func(5),
         quantile_func(25),
         quantile_func(75),
         quantile_func(95)])
    ax_feature.columns = ['mean', 'std', 'min', 'max', 'q5', 'q25', 'q75', 'q95']
    ax_feature['mean'].plot(ax=ax, markersize=5, marker='o', color='black', label='mean')
    ax_feature['min'].plot(ax=ax, color='grey', linestyle='--')
    ax_feature['max'].plot(ax=ax, color='grey', linestyle='--')

    # Periods up to the last train date, plus any period labelled between the
    # last train date and the first test date, count as train.
    periods = ax_feature.index
    ax_feature_train = ax_feature[(periods <= train_max_date) | (periods < test_min_date)]
    ax_feature_test = ax_feature[periods >= test_min_date]
    for part, colour in ((ax_feature_train, 'b'), (ax_feature_test, 'g')):
        for low, high in (('min', 'max'), ('q5', 'q95'), ('q25', 'q75')):
            ax.fill_between(part.index, part[low], part[high], alpha=0.05, color=colour)

    # Grey patch bridging the last train period and the first test period;
    # its height is the average of the two adjacent min/max values. Skipped
    # when either side has no periods, since there is nothing to bridge.
    if len(ax_feature_train) and len(ax_feature_test):
        gap_low = (ax_feature_train['min'].iloc[-1] + ax_feature_test['min'].iloc[0]) / 2.
        gap_high = (ax_feature_train['max'].iloc[-1] + ax_feature_test['max'].iloc[0]) / 2.
        gap_start = ax_feature_train.index.max()
        gap_end = ax_feature_test.index.min()
        ax.fill_betweenx([gap_low, gap_high],
                         [gap_start, gap_start],
                         [gap_end, gap_end], color='grey', alpha=0.2)

    # grid() without arguments toggles the grid, so the second call under
    # `if grid` undoes the first one.
    ax.grid()
    ax.set_xticks(ax_feature.index)
    ax.set_ylabel(feature)
    if grid:
        ax.grid()

    # ax4: share of rows per period
    is_missing = data_feature.isnull()
    rows_per_period = is_missing.resample(frequency).count()
    rows_share = rows_per_period / rows_per_period.sum()
    rows_share.plot(ax=ax4, use_index=False, color='grey', marker='o', markersize=5, label='Count %')
    ax4.fill_between(range(0, rows_share.shape[0]), [0] * rows_share.shape[0],
                     rows_share.values, alpha=0.5, color='grey')
    ax4.grid()
    ax4.set_yticks([])
    ax4.set_xticks([])
    ax4.legend()

    # ax5: NaN rate per period, drawn only when something is missing
    if is_missing.sum() > 0:
        nan_rate = is_missing.resample(frequency).mean()
        nan_rate.plot(ax=ax5, use_index=False, color='grey', marker='o', markersize=5, label='NaN %')
        ax5.fill_between(range(0, nan_rate.shape[0]), [0] * nan_rate.shape[0],
                         nan_rate.values, alpha=0.5, color='grey')
        nan_relative = nan_rate / nan_rate.max()
        nan_relative.plot(ax=ax5, use_index=False, color='grey', label='NaN rel %', linestyle='--')
        ax5.legend()
    ax5.set_xticks([])
    ax5.set_ylim([0, 1])
    ax5.set_yticks([])

    # ax6: rank label
    ax6.set_xticks([])
    ax6.set_yticks([])
    if rank is not None:
        ax6.text(0.2, 0.4, 'Rank: {0}'.format(rank), fontsize=18)
    return fig
# Build the overview for the feature chosen at the top of this cell,
# clipping values to the 1%-99% quantile range.
fig = numplot(
    feature, train, test,
    target=target, date=date,
    clips=[0.01, 0.99], rank=18
)
In [26]:
# Display the figure object returned by the preceding numplot call.
fig
Out[26]:
In [42]:
# Feature passed to the catplot call at the end of this cell.
feature = 'CC_CASH'
def quantile_func(q_value):
    """Build an aggregator that returns the ``q_value``-th percentile.

    The percentile is taken with the 'nearest' rule, so the result is always
    one of the observed values rather than an interpolated one.

    Parameters
    ----------
    q_value : float
        Percentile to compute, on the 0-100 scale used by ``np.percentile``.

    Returns
    -------
    callable
        A function of one array-like argument. It returns NaN for empty input.
        Its ``__name__`` is ``'q<q_value>'`` so that several of these can be
        told apart when they are passed together in one ``agg`` list.
    """
    # NumPy renamed the `interpolation` keyword to `method`; probe once with
    # known-good data so a TypeError can only come from the keyword itself.
    try:
        np.percentile([0.0], 50, method='nearest')
        rule = {'method': 'nearest'}
    except TypeError:
        rule = {'interpolation': 'nearest'}

    def _quantile(x):
        values = np.asarray(x)
        if values.size == 0:
            # Nothing to rank (e.g. an empty resampling bin).
            return np.nan
        return np.percentile(values, q=q_value, **rule)

    _quantile.__name__ = 'q{0}'.format(q_value)
    return _quantile
def catplot(feature, train, test, target, date, frequency='1M', clip=0.01,
            grid=False, rank=None, figsize=(15, 6)):
    """Draw a multi-panel overview of a categorical feature over train and test.

    Values are converted to strings (missing values become ``'NaN'``), and
    every category whose share of the combined sample is below ``clip`` is
    merged into ``'OTHER'``.

    Panels
    ------
    * main panel: stacked share (in %) of each category per period, with the
      train and test periods drawn separately and the gap between them grey;
    * top strip: share of rows per period;
    * side panel: mean of ``target`` per category on ``train``, with a dashed
      line at the overall train mean;
    * corner box: the ``rank`` label (``'NA'`` when ``rank`` is ``None``).

    Parameters
    ----------
    feature : str
        Column present in both ``train`` and ``test``; also used as the
        pyplot figure label.
    train, test : pandas.DataFrame
        Frames holding ``feature`` and ``date``; ``train`` also holds
        ``target``.
    target : str
        Column of ``train`` averaged per category.
    date : str
        Datetime column used as the time index.
    frequency : str
        Period length used for both the main panel and the top strip.
    clip : float
        Minimum share of the combined sample for a category to be kept.
    grid : bool
        Whether to call ``grid()`` on the main panel.
    rank : object, optional
        Value printed in the corner box.
    figsize : tuple
        Figure size in inches.

    Returns
    -------
    matplotlib.figure.Figure
    """
    fig = plt.figure(feature, figsize=figsize)
    # pyplot hands back the already-open figure for a label it has seen, so
    # wipe whatever an earlier call drew before laying out the panels again.
    fig.clf()
    layout = (6, 7)
    ax = plt.subplot2grid(layout, (1, 0), rowspan=5, colspan=5)
    ax2 = plt.subplot2grid(layout, (1, 6), rowspan=5, colspan=1)
    ax3 = plt.subplot2grid(layout, (0, 0), rowspan=1, colspan=5)
    ax4 = plt.subplot2grid(layout, (0, 6), rowspan=1, colspan=1)

    def _as_labels(frame):
        # String labels indexed by date; missing values get their own label.
        return pd.Series(frame[feature].values, index=frame[date].values).fillna('NaN').astype(str)

    train_feature = _as_labels(train)
    test_feature = _as_labels(test)
    data_feature = pd.concat([train_feature, test_feature], axis=0)
    train_max_date = train_feature.index.max()
    test_min_date = test_feature.index.min()

    # Merge categories rarer than `clip` (share of the combined sample).
    vc = data_feature.value_counts() / data_feature.shape[0]
    to_join_other = vc[vc < clip].index

    def _merge_rare(labels):
        # Positional mask, so the repeated date index never has to be aligned.
        return labels.where(~labels.isin(to_join_other).values, 'OTHER')

    data_feature = _merge_rare(data_feature)
    train_feature = _merge_rare(train_feature)
    test_feature = _merge_rare(test_feature)

    # ax: stacked category shares per period
    counts = (data_feature
              .groupby([pd.Grouper(freq=frequency), data_feature.values])
              .count()
              .unstack())
    # rows = periods, columns = categories; shares sum to 1 in every row
    shares = counts.div(counts.sum(axis=1), axis=0).fillna(0)
    # Categories ordered by their largest share in any period.
    order = shares.max(axis=0).sort_values(ascending=False)
    current_palette = sns.color_palette('pastel') + sns.color_palette('muted')
    # Lower edge of each category's band, measured down from 100%.
    ax_feature = (1 - shares[order.index].cumsum(axis=1)) * 100

    periods = ax_feature.index
    # Periods up to the last train date, plus any period labelled between the
    # last train date and the first test date, count as train.
    ax_feature_train = ax_feature[(periods <= train_max_date) | (periods < test_min_date)]
    ax_feature_test = ax_feature[periods >= test_min_date]

    def _plot(temp, ax, order, current_palette, isTrain=True):
        if temp.empty:
            return
        n_colors = len(current_palette)
        for i, f in enumerate(order.index):
            # Cycle through the palette so that categories beyond its length
            # are still drawn.
            c = current_palette[i % n_colors]
            temp[f].plot(ax=ax, color='grey', grid=False, label='_nolegend_')
            # Only the train part contributes legend entries.
            label = f if isTrain else '_nolegend_'
            upper = [100] * temp.shape[0] if i == 0 else temp.iloc[:, i - 1]
            ax.fill_between(temp.index, upper, temp.iloc[:, i], color=c, alpha=0.5, label=label)

    _plot(ax_feature_train, ax, order, current_palette, isTrain=True)
    _plot(ax_feature_test, ax, order, current_palette, isTrain=False)
    # Grey patch over the gap between the last train and first test period;
    # skipped when either side has no periods.
    if len(ax_feature_train) and len(ax_feature_test):
        gap_start = ax_feature_train.index.max()
        gap_end = ax_feature_test.index.min()
        ax.fill_betweenx([0, 100],
                         [gap_start, gap_start],
                         [gap_end, gap_end], color='grey', alpha=0.2)
    ax.legend(bbox_to_anchor=(1.15, 1.01))
    ax.set_ylim([0, 100])
    ax.set_ylabel('Part of sample, %')
    if grid:
        ax.grid()

    # ax3: share of rows per period
    rows_per_period = data_feature.isnull().resample(frequency).count()
    rows_share = rows_per_period / rows_per_period.sum()
    rows_share.plot(ax=ax3, use_index=False, color='grey', marker='o', markersize=5, label='Count %')
    ax3.fill_between(range(0, rows_share.shape[0]), [0] * rows_share.shape[0],
                     rows_share.values, alpha=0.5, color='grey')
    ax3.grid()
    ax3.set_yticks([])
    ax3.set_xticks([])
    ax3.legend()
    ax3.set_title(feature)

    # ax2: mean target per category on train
    g = train.groupby(train_feature.values)[target].mean().reset_index()
    g.columns = ['val', target]
    g['val'] = g['val'].astype(str)
    sns.barplot(x=target, y='val', data=g, palette=current_palette,
                order=order.index.astype(str), alpha=0.5, ax=ax2)
    ax2.grid(False)
    ax2.set_xlabel('Mean of TARGET')
    ax2.set_ylabel('')
    base = train[target].mean()
    ax2.plot([base, base], ax2.get_ylim(), linestyle='--', color='grey')
    ax2.set_yticks([])
    # Tick only the axis limits and the overall mean.
    ax2.set_xticks([round(i, 2) for i in tuple(ax2.get_xlim()) + (base, )])

    # ax4: rank label
    ax4.set_xticks([])
    ax4.set_yticks([])
    ax4.text(0.05, 0.4, 'Rank: {0}'.format('NA' if rank is None else rank), fontsize=16)
    return fig
# Build the overview for the feature chosen at the top of this cell,
# merging categories below 0.5% of the sample.
fig = catplot(
    feature, train, test,
    target=target, date=date,
    clip=0.005, rank=18
)
In [47]:
In [43]:
# Display the figure object returned by the preceding catplot call.
fig
Out[43]:
In [ ]:
# NOTE(review): looks like a scratch expression; no other visible cell uses
# its result -- candidate for removal.
pd.Series([1])
In [29]:
# Frequency of each CC_CASH value in the training frame.
train['CC_CASH'].value_counts()
Out[29]:
In [ ]: