In [1]:
import pandas as pd
import matplotlib.pylab as plt
import seaborn as sns
from scipy.stats import ks_2samp
import numpy as np
from corgi.plots import numplot, catplot

# Raise the column limit DataFrame.info() uses when deciding whether to print
# per-column details.
pd.set_option('display.max_info_columns', 200)
# %matplotlib inline
# Apply seaborn's "whitegrid" theme to the figures produced below.
sns.set_style('whitegrid')

In [2]:
# Column names shared by the cells below.
# Not referenced in the visible cells; presumably the row identifier -- TODO confirm.
ID = 'REQUEST_ID'
# Date column: parsed with pd.to_datetime and used as the time index of the plots.
date = 'OPEN_DATE1'
# Column passed to the plotting helpers as `target`.
target = 'PD_12_90'
# Not referenced in the visible cells; presumably a train/test flag column -- TODO confirm.
isTrain = 'isTrain'

In [3]:
# The train sample is split over four cp1251-encoded, ';'-separated CSV parts.
# The per-part names are kept because other cells may still refer to them.
train1, train2, train3, train4 = (
    pd.read_csv(path, encoding='cp1251', sep=';')
    for path in (
        'data/BAZA_SC11_ALGOMOST_BOY_TRAYN_P1.csv',
        'data/BAZA_SC11_ALGOMOST_BOY_TRAYN_P2.csv',
        'data/BAZA_SC11_ALGOMOST_BOY_TRAYN_P3.csv',
        'data/BAZA_SC11_ALGOMOST_BOY_TRAYN_P4.csv',
    )
)
test = pd.read_csv('data/baza_SC11_ALGOMOST_boy_test.csv', encoding='cp1251', sep=';')

# Stack the parts row-wise and renumber the rows 0..n-1.
train = pd.concat([train1, train2, train3, train4], axis=0).reset_index(drop=True)

In [4]:
# Parse the date column of both frames from day/month/two-digit-year strings
# (format "%d/%m/%y"); the plotting helpers below use it as a datetime index.
train[date] = pd.to_datetime(train[date], format="%d/%m/%y")
test[date] = pd.to_datetime(test[date], format="%d/%m/%y")

In [18]:
# train.info()

In [26]:
# Categorical-feature dashboard for a single column.
# NOTE(review): this call passes `figsize`, so it only works if the `catplot`
# bound at run time accepts that argument. Which one that is (the corgi.plots
# import in the first cell or the definition further down this notebook)
# depends on cell execution order -- TODO confirm the intended one.
# feature = 'tu_regions_id_fact_zacr'
feature = 'MAX_all_symb'
fig = catplot(feature, train, test, target=target, date=date, clip=0.01, figsize=(15, 6), rank=18)
fig


Out[26]:

In [25]:
# Numeric-feature dashboard for a single column, clipped to the 1%-99% quantiles.
# NOTE(review): this call passes `figsize`, so it only works if the `numplot`
# bound at run time accepts that argument. Which one that is (the corgi.plots
# import in the first cell or the definition further down this notebook)
# depends on cell execution order -- TODO confirm the intended one.
feature = 'salary'
fig = numplot(feature, train, test, target=target, date=date, clips=[0.01, 0.99], figsize=(20, 2), rank=18)
fig


Out[25]:

Numeric plot


In [25]:
feature = 'age'


def quantile_func(q_value):
    """Build an aggregation callable that returns the ``q_value``-th percentile.

    The percentile is taken with nearest-rank selection, so the result is
    always one of the observed values.

    Parameters
    ----------
    q_value : float
        Percentile to compute, in the range 0..100.

    Returns
    -------
    callable
        ``f(x) -> scalar``. It is named ``'q<q_value>'`` so that several of
        these can be passed together in one list to ``.agg`` without clashing
        on the anonymous ``<lambda>`` name. For empty input it returns NaN.
    """
    def _quantile(x):
        # An empty resampling bin has no percentile; report it as missing.
        if np.size(x) == 0:
            return np.nan
        try:
            # NumPy >= 1.22 spelling of the selection rule.
            return np.percentile(x, q=q_value, method='nearest')
        except TypeError:
            # Older NumPy does not know `method`; use the legacy keyword.
            return np.percentile(x, q=q_value, interpolation='nearest')

    _quantile.__name__ = 'q%g' % q_value
    return _quantile

def numplot(feature, train, test, target, date, frequency='1M',
             clips=[0, 1.0], grid=False, rank=None, figsize=(15, 6)):
    """Draw a diagnostic dashboard for one numeric feature over time.

    Panels (8x7 grid):
      * main panel: per-period mean, min/max and 5-95 / 25-75 percentile
        bands of the clipped feature; train periods in blue, test in green;
      * two KDE side panels sharing the y axis with the main panel:
        train vs. test, and target class 0 vs. 1 (train only), each titled
        with the two-sample Kolmogorov-Smirnov statistic on non-null values;
      * top strips: share of rows per period and share of NaN per period;
      * corner box: the optional ``rank`` label.

    Parameters
    ----------
    feature : str
        Column present in both ``train`` and ``test``.
    train, test : pandas.DataFrame
        Frames holding ``feature`` and ``date``; ``train`` also holds ``target``.
    target : str
        Column of ``train`` that is cast to bool to split the two classes.
    date : str
        Datetime column used as the time index.
    frequency : str
        Rule passed to ``Series.resample``.
    clips : sequence of two floats
        Lower and upper quantile levels (computed on train+test) used to clip
        the feature. The list default is only read, never mutated.
    grid : bool
        Whether grid lines are shown on the main panel.
    rank : optional
        Value printed in the corner box; nothing is printed when ``None``.
    figsize : tuple
        Figure size in inches.

    Returns
    -------
    matplotlib.figure.Figure
    """
    fig = plt.figure(1, figsize=figsize)
    # Figure number 1 is reused by every call: remove what a previous call
    # drew and re-apply the requested size to the (possibly existing) figure.
    fig.clf()
    fig.set_size_inches(figsize)
    ax = plt.subplot2grid((8, 7), (3, 0), rowspan=5, colspan=5)
    ax2 = plt.subplot2grid((8, 7), (3, 5), rowspan=5, sharey=ax)
    ax3 = plt.subplot2grid((8, 7), (3, 6), rowspan=5, sharey=ax)
    ax4 = plt.subplot2grid((8, 7), (0, 0), rowspan=1, colspan=5)
    ax5 = plt.subplot2grid((8, 7), (1, 0), rowspan=2, colspan=5)
    ax6 = plt.subplot2grid((8, 7), (0, 5), rowspan=2, colspan=2)

    # Re-index the feature by date so it can be resampled per period.
    train_feature = pd.Series(train[feature].values, index=train[date].values)
    test_feature = pd.Series(test[feature].values, index=test[date].values)
    # pd.concat instead of Series.append, which recent pandas no longer has.
    data_feature = pd.concat([train_feature, test_feature], axis=0)

    train_max_date = train_feature.index.max()
    test_min_date = test_feature.index.min()

    # Clip all three series to the same quantile bounds of the pooled data.
    data_clips = data_feature.quantile(clips).values
    data_feature = data_feature.clip(data_clips[0], data_clips[1])
    train_feature = train_feature.clip(data_clips[0], data_clips[1])
    test_feature = test_feature.clip(data_clips[0], data_clips[1])

    # ax2: train vs. test distribution
    train_valid = train_feature.dropna()
    test_valid = test_feature.dropna()
    sns.kdeplot(train_valid, shade=True, label=u'train', vertical=True, ax=ax2)
    sns.kdeplot(test_valid, shade=True, label=u'test', vertical=True, ax=ax2)
    ax2.set_title('KS: %g' % ks_2samp(train_valid, test_valid)[0])
    # A bare grid() call toggles visibility; state the wanted result instead.
    ax2.grid(False)
    ax2.set_xticks([])

    # ax3: distribution by target class (train only)
    # Positional boolean arrays avoid label alignment on the duplicated dates.
    target_mask = train[target].values.astype(bool)
    train_not_null = train_feature.notnull().values
    train_negative = train_feature[~target_mask & train_not_null]
    train_positive = train_feature[target_mask & train_not_null]
    sns.kdeplot(train_negative, shade=True, label=u'0', vertical=True, ax=ax3, color='red', alpha=0.3)
    sns.kdeplot(train_positive, shade=True, label=u'1', vertical=True, ax=ax3, color='purple', alpha=0.3)
    # NaN must be excluded here as well, otherwise the statistic is distorted.
    ax3.set_title('KS: %g' % ks_2samp(train_positive, train_negative)[0])
    ax3.grid(False)
    ax3.set_xticks([])

    # ax: per-period statistics of the pooled, clipped feature
    # String names select pandas' own mean/std/min/max implementations.
    ax_feature = data_feature.dropna().resample(frequency).agg(
        ['mean', 'std', 'min', 'max',
         quantile_func(5), quantile_func(25), quantile_func(75), quantile_func(95)])
    ax_feature.columns = ['mean', 'std', 'min', 'max', 'q5', 'q25', 'q75', 'q95']
    ax_feature['mean'].plot(ax=ax, markersize=5, marker='o', color='black', label='mean')
    ax_feature['min'].plot(ax=ax, color='grey', linestyle='--')
    ax_feature['max'].plot(ax=ax, color='grey', linestyle='--')

    # Train part: every period up to the last train date plus any period that
    # lies between the last train date and the first test date.
    ax_feature_train = ax_feature[(ax_feature.index <= train_max_date) |
                                  (ax_feature.index < test_min_date)]
    ax_feature_test = ax_feature[ax_feature.index >= test_min_date]

    for part, color in ((ax_feature_train, 'b'), (ax_feature_test, 'g')):
        ax.fill_between(part.index, part['min'], part['max'], alpha=0.05, color=color)
        ax.fill_between(part.index, part['q5'], part['q95'], alpha=0.05, color=color)
        ax.fill_between(part.index, part['q25'], part['q75'], alpha=0.05, color=color)

    # Grey band bridging the last train period and the first test period;
    # it needs at least one period on each side.
    if len(ax_feature_train) and len(ax_feature_test):
        # NOTE(review): the test side uses its LAST period (iloc[-1]) although
        # the band ends at the FIRST test period -- kept as is, confirm intent.
        ax.fill_betweenx([(ax_feature_train['min'].iloc[-1] + ax_feature_test['min'].iloc[-1]) / 2.,
                          (ax_feature_train['max'].iloc[-1] + ax_feature_test['max'].iloc[-1]) / 2.],
                         [ax_feature_train.index.max(), ax_feature_train.index.max()],
                         [ax_feature_test.index.min(), ax_feature_test.index.min()], color='grey', alpha=0.2)
    ax.set_xticks(ax_feature.index)
    ax.set_ylabel(feature)
    ax.grid(grid)

    # ax4: share of rows per period
    temp = data_feature.isnull().resample(frequency).count()
    temp = temp / temp.sum()
    temp.plot(ax=ax4, use_index=False, color='grey', marker='o', markersize=5, label='Count %')
    ax4.fill_between(range(0, temp.shape[0]), [0] * temp.shape[0], temp.values, alpha=0.5, color='grey')
    ax4.grid(False)
    ax4.set_yticks([])
    ax4.set_xticks([])
    ax4.legend()

    # ax5: share of NaN per period, absolute and relative to the worst period
    is_null = data_feature.isnull()
    if is_null.sum() > 0:
        temp = is_null.resample(frequency).mean()
        temp.plot(ax=ax5, use_index=False, color='grey', marker='o', markersize=5, label='NaN %')
        ax5.fill_between(range(0, temp.shape[0]), [0] * temp.shape[0], temp.values, alpha=0.5, color='grey')

        temp = temp / temp.max()
        temp.plot(ax=ax5, use_index=False, color='grey', label='NaN rel %', linestyle='--')
        ax5.legend()
    ax5.set_xticks([])
    ax5.set_ylim([0, 1])
    ax5.set_yticks([])

    # ax6: rank label
    ax6.set_xticks([])
    ax6.set_yticks([])
    # `is not None` so that a rank of 0 is still printed.
    if rank is not None:
        ax6.text(0.2, 0.4, 'Rank:      {0}'.format(rank), fontsize=18)
    return fig
            
# Runs the numplot defined in this cell, which shadows the numplot imported
# from corgi.plots in the first cell.
fig = numplot(feature, train, test, target=target, date=date, clips=[0.01, 0.99], rank=18)

In [26]:
fig


Out[26]:

In [42]:
feature = 'CC_CASH'


def quantile_func(q_value):
    """Build an aggregation callable that returns the ``q_value``-th percentile.

    The percentile is taken with nearest-rank selection, so the result is
    always one of the observed values.

    Parameters
    ----------
    q_value : float
        Percentile to compute, in the range 0..100.

    Returns
    -------
    callable
        ``f(x) -> scalar``. It is named ``'q<q_value>'`` so that several of
        these can be passed together in one list to ``.agg`` without clashing
        on the anonymous ``<lambda>`` name. For empty input it returns NaN.
    """
    def _quantile(x):
        # An empty resampling bin has no percentile; report it as missing.
        if np.size(x) == 0:
            return np.nan
        try:
            # NumPy >= 1.22 spelling of the selection rule.
            return np.percentile(x, q=q_value, method='nearest')
        except TypeError:
            # Older NumPy does not know `method`; use the legacy keyword.
            return np.percentile(x, q=q_value, interpolation='nearest')

    _quantile.__name__ = 'q%g' % q_value
    return _quantile

def catplot(feature, train, test, target, date, frequency='1M', clip=0.01, grid=False, rank=None,
            figsize=(15, 6)):
    """Draw a diagnostic dashboard for one categorical feature over time.

    Panels (6x7 grid):
      * main panel: stacked per-period shares (in %) of each category, drawn
        separately for the train and the test periods with a grey band
        between them;
      * top strip: share of rows per period, titled with the feature name;
      * right panel: mean of ``target`` per category on ``train`` with the
        overall train mean as a dashed line;
      * corner box: the ``rank`` label (``'NA'`` when ``rank`` is ``None``).

    Missing values become the category ``'NaN'``; categories whose overall
    share is below ``clip`` are merged into ``'OTHER'``.

    Parameters
    ----------
    feature : str
        Column present in both ``train`` and ``test``; also used as the
        matplotlib figure label.
    train, test : pandas.DataFrame
        Frames holding ``feature`` and ``date``; ``train`` also holds ``target``.
    target : str
        Column of ``train`` averaged per category.
    date : str
        Datetime column used as the time index.
    frequency : str
        Period rule used for both the main panel and the top strip.
    clip : float
        Minimum overall share a category needs to keep its own band.
    grid : bool
        Whether grid lines are shown on the main panel.
    rank : optional
        Value printed in the corner box.
    figsize : tuple
        Figure size in inches.

    Returns
    -------
    matplotlib.figure.Figure
    """
    fig = plt.figure(feature, figsize=figsize)
    # The figure is looked up by label, so a repeated call for the same
    # feature gets the old figure back: clear it and re-apply the size.
    fig.clf()
    fig.set_size_inches(figsize)
    ax = plt.subplot2grid((6, 7), (1, 0), rowspan=5, colspan=5)
    ax2 = plt.subplot2grid((6, 7), (1, 6), rowspan=5, colspan=1)
    ax3 = plt.subplot2grid((6, 7), (0, 0), rowspan=1, colspan=5)
    ax4 = plt.subplot2grid((6, 7), (0, 6), rowspan=1, colspan=1)

    # Categories as strings indexed by date; missing values become 'NaN'.
    train_feature = pd.Series(train[feature].values, index=train[date].values).fillna('NaN').astype(str)
    test_feature = pd.Series(test[feature].values, index=test[date].values).fillna('NaN').astype(str)
    data_feature = pd.concat([train_feature, test_feature], axis=0)

    train_max_date = train_feature.index.max()
    test_min_date = test_feature.index.min()

    # Merge rare categories (overall share below `clip`) into 'OTHER'.
    vc = data_feature.value_counts() / data_feature.shape[0]
    to_join_other = vc[vc < clip].index
    for series in (data_feature, train_feature, test_feature):
        # Positional boolean array: no label alignment on the duplicated dates.
        series.loc[series.isin(to_join_other).values] = 'OTHER'

    # ax: share of every category per period (rows = categories, cols = periods)
    # pd.Grouper replaces the removed pd.TimeGrouper and honours `frequency`.
    counts = data_feature.groupby([pd.Grouper(freq=frequency), data_feature]).count().unstack()
    shares = counts.T
    shares = (shares / shares.sum(axis=0)).fillna(0)
    # Largest peak share first; this order is used for stacking and colours.
    order = shares.max(axis=1).sort_values(ascending=False)
    current_palette = sns.color_palette('pastel') + sns.color_palette('muted')
    # Cycle the palette so categories beyond its length are still drawn.
    colors = [current_palette[i % len(current_palette)] for i in range(len(order))]
    # Lower edge of each band, in percent, measured down from 100.
    ax_feature = (1 - shares.loc[order.index].cumsum(axis=0).T) * 100

    # Train part: every period up to the last train date plus any period that
    # lies between the last train date and the first test date.
    ax_feature_train = ax_feature[(ax_feature.index <= train_max_date) |
                                  (ax_feature.index < test_min_date)]
    ax_feature_test = ax_feature[ax_feature.index >= test_min_date]

    def _plot(temp, ax, order, palette, isTrain=True):
        """Draw the stacked bands of one part; only the train part gets legend labels."""
        if temp.empty:
            return
        alpha = 0.5
        for i, (f, c) in enumerate(zip(order.index, palette)):
            temp[f].plot(ax=ax, color='grey', grid=False, label='_nolegend_')
            label = f if isTrain else '_nolegend_'
            # The first band hangs from 100 %, every next one from the previous edge.
            upper = [100] * temp.shape[0] if i == 0 else temp.iloc[:, i - 1]
            ax.fill_between(temp.index, upper, temp.iloc[:, i], color=c, alpha=alpha, label=label)

    _plot(ax_feature_train, ax, order, colors, isTrain=True)
    _plot(ax_feature_test, ax, order, colors, isTrain=False)
    # Grey band between the last train period and the first test period.
    if not ax_feature_train.empty and not ax_feature_test.empty:
        ax.fill_betweenx([0, 100],
                         [ax_feature_train.index.max(), ax_feature_train.index.max()],
                         [ax_feature_test.index.min(), ax_feature_test.index.min()], color='grey', alpha=0.2)

    ax.legend(bbox_to_anchor=(1.15, 1.01))
    ax.set_ylim([0, 100])
    ax.set_ylabel('Part of sample, %')
    # A bare grid() call toggles visibility; state the wanted result instead.
    ax.grid(grid)

    # ax3: share of rows per period
    temp = data_feature.isnull().resample(frequency).count()
    temp = temp / temp.sum()
    temp.plot(ax=ax3, use_index=False, color='grey', marker='o', markersize=5, label='Count %')
    ax3.fill_between(range(0, temp.shape[0]), [0] * temp.shape[0], temp.values, alpha=0.5, color='grey')
    ax3.grid(False)
    ax3.set_yticks([])
    ax3.set_xticks([])
    ax3.legend()
    ax3.set_title(feature)

    # ax2: mean target per category (train only), same order and colours as ax
    g = train.groupby(train_feature.values)[target].mean()
    g = g.reset_index()
    g.columns = ['val', target]
    g['val'] = g['val'].astype(str)
    sns.barplot(x=target, y='val', data=g, palette=colors, order=order.index.astype(str), alpha=0.5, ax=ax2)
    ax2.grid(False)
    ax2.set_xlabel('Mean of TARGET')
    ax2.set_ylabel('')
    base = train[target].mean()
    ax2.plot([base, base], ax2.get_ylim(), linestyle='--', color='grey')
    ax2.set_yticks([])
    ax2.set_xticks([round(i, 2) for i in list(ax2.get_xlim()) + [base]])

    # ax4: rank label
    ax4.set_xticks([])
    ax4.set_yticks([])
    if rank is None:
        rank = 'NA'
    ax4.text(0.05, 0.4, 'Rank:      {0}'.format(rank), fontsize=16)
    return fig
    
# Runs the catplot defined in this cell, which shadows the catplot imported
# from corgi.plots in the first cell.
fig = catplot(feature, train, test, target=target, date=date, clip=0.005, rank=18)

In [47]:



---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-47-8f2fd4740d22> in <module>()
----> 1 fig = catplot(feature, train, test, target=target, date=date, clip=0.005, figsize=(15, 6), rank=18)
      2 fig

/home/anton/anaconda2/envs/p36_corgi/lib/python3.6/site-packages/corgi/plots.py in catplot(feature, train, test, target, date, frequency, clip, grid, figsize, rank)
    141     train_feature = pd.Series(train[feature].values, index=train[date].values).fillna('NaN').astype(str)
    142     test_feature = pd.Series(test[feature].values, index=test[date].values).fillna('NaN').astype(str)
--> 143     data_feature = train_feature.append(test_feature).fillna('NaN').astype(str)
    144 
    145     vc = data_feature.value_counts() / data_feature.shape[0]

/home/anton/anaconda2/envs/p36_corgi/lib/python3.6/site-packages/pandas/core/generic.py in __getattr__(self, name)
   2968             if name in self._info_axis:
   2969                 return self[name]
-> 2970             return object.__getattribute__(self, name)
   2971 
   2972     def __setattr__(self, name, value):

AttributeError: 'Series' object has no attribute 'appned'

In [43]:
fig


Out[43]:

In [ ]:
pd.Series([1] )

In [29]:
train.CC_CASH.value_counts()


Out[29]:
КЭШ    1466243
КК      718257
Name: CC_CASH, dtype: int64

In [ ]: