EDA2


In [111]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
from sklearn.preprocessing import LabelEncoder
import re 
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split
import nltk 
from extract_feat_base import * 

try: 
    tokenizer = nltk.data.load('tokenizers/punkt/russian.pickle')
except:
    print('> trying to download punkt...')
    nltk.download('punkt')
    nltk.data.load('tokenizers/punkt/english.pickle')
    
try: 
    from nltk.corpus import stopwords
except:
    print('> trying to download stopwords...')
    nltk.download('stopwords')
    from nltk.corpus import stopwords

#stops = set(stopwords.words("english"))
stops = set(stopwords.words('russian'))


> trying to download punkt...
[nltk_data] Downloading package punkt to C:\Users\gtesei/nltk_data...
[nltk_data]   Package punkt is already up-to-date!

In [112]:
meta = {'target': 'deal_probability', 
        'test_id': 'item_id', 
       'cols': {
           'item_id': 'REM', 
           'user_id': 'CAT', 
           'region': 'CAT', 
           'city':   'CAT', 
           'parent_category_name': 'CAT',
           'category_name': 'CAT',
           'param_1': 'CAT', 
           'param_2': 'CAT', 
           'param_3': 'CAT', 
           'title': 'LEN',  
           'description': 'LEN' , 
           'price': 'NUM', 
           'item_seq_number': 'NUM', 
           'activation_date': 'DATE',           
           'user_type': 'CAT', 
           'image': 'REM',
           'image_top_1': 'NUM'
       }}

train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

print('--------------> Basic Feature Engineering ... ')
all_data , y_train = encode_dataset(train=train[:1000],test=test[:1000],meta=meta)
print(all_data.head())


--------------> Basic Feature Engineering ... 
0 price NUM
1 activation_date DATE
2 user_type CAT
3 title LEN
4 category_name CAT
5 parent_category_name CAT
6 description LEN
7 image REM
8 param_1 CAT
9 param_3 CAT
10 region CAT
11 item_id REM
12 image_top_1 NUM
13 param_2 CAT
14 city CAT
15 item_seq_number NUM
16 user_id CAT
   user_id  region  city  parent_category_name  category_name  param_1  \
0     1747      19    81                     4             41      122   
1      436      17   217                     2             22       59   
2     1123      16   213                     0              2       31   
3     1468      21   163                     4             41       18   
4     1859       4    51                     6              0      139   

   param_2  param_3    price  item_seq_number  activation_date  user_type  \
0       25      112    400.0                2                1          1   
1       25      112   3000.0               19                6          1   
2       25      112   4000.0                9                0          1   
3       25      112   2200.0              286                5          0   
4       32       13  40000.0                3                3          1   

   image_top_1  activation_date_is_holiday  title_len  description_len  
0       1008.0                           0          3                7  
1        692.0                           0          3                7  
2       3032.0                           0          2               17  
3        796.0                           0          1                3  
4       2264.0                           0          3                4  

Avg price per user (-price / avg)


In [117]:
def add_avg_per(df,what_to_avg,on,new_name,include_delta=True,include_perc=True):
    if type(on) == str:
        _full = [on,what_to_avg]
        _fulla = [on,new_name]
    elif type(on) == list:
        _full = on.copy()
        _full.append(what_to_avg)
        _fulla = on.copy()
        _fulla.append(new_name)
    else:
        raise Exception('what type is on!')
    _avg = df.groupby(on)[_full].mean()
    _avg.columns = _fulla
    prev_len = len(df)
    df = df.merge(_avg,how='inner' , on=on)
    assert len(df) == prev_len
    if include_delta:
        df[str(new_name+'_delta')] = df[what_to_avg] - df[new_name]
    if include_perc:
        df[str(new_name+'_perc')] = (df[what_to_avg] - df[new_name])/df[new_name]
    return df

In [118]:
all_data = add_avg_per(df=all_data,what_to_avg='price',on='user_id',new_name='avg_price_usr')
all_data[all_data['user_id']==1318]


Out[118]:
user_id region city parent_category_name category_name param_1 param_2 param_3 price item_seq_number activation_date user_type image_top_1 activation_date_is_holiday title_len description_len avg_price_usr avg_price_usr_delta avg_price_usr_perc
1075 1318 17 217 4 28 61 103 44 200.0 23132 4 0 622.0 0 3 32 200.0 0.0 0.0
1076 1318 17 217 4 28 61 34 44 200.0 23368 0 0 122.0 0 5 35 200.0 0.0 0.0
1077 1318 17 217 4 28 61 77 48 100.0 23054 4 0 562.0 0 3 32 200.0 -100.0 -0.5
1078 1318 17 217 4 28 101 106 53 300.0 22987 3 0 632.0 0 3 32 200.0 100.0 0.5

In [119]:
all_data[all_data['user_id']==993]


Out[119]:
user_id region city parent_category_name category_name param_1 param_2 param_3 price item_seq_number activation_date user_type image_top_1 activation_date_is_holiday title_len description_len avg_price_usr avg_price_usr_delta avg_price_usr_perc
286 993 23 252 5 15 130 0 115 3300000.0 76411 2 2 2220.0 0 6 100 1.926667e+06 1.373333e+06 0.712803
287 993 23 252 5 15 130 0 115 1690000.0 78778 1 2 1295.0 0 6 49 1.926667e+06 -2.366667e+05 -0.122837
288 993 23 252 5 13 130 83 112 790000.0 78812 1 2 2218.0 0 4 40 1.926667e+06 -1.136667e+06 -0.589965

In [122]:
all_data = add_avg_per(df=all_data,what_to_avg='price',on=['user_id','category_name'],new_name='avg_price_usr_cat',include_delta=True,include_perc=True)
all_data[all_data['user_id']==1318]


Out[122]:
user_id region city parent_category_name category_name param_1 param_2 param_3 price item_seq_number ... image_top_1 activation_date_is_holiday title_len description_len avg_price_usr avg_price_usr_delta avg_price_usr_perc avg_price_usr_cat avg_price_usr_cat_delta avg_price_usr_cat_perc
1075 1318 17 217 4 28 61 103 44 200.0 23132 ... 622.0 0 3 32 200.0 0.0 0.0 200.0 0.0 0.0
1076 1318 17 217 4 28 61 34 44 200.0 23368 ... 122.0 0 5 35 200.0 0.0 0.0 200.0 0.0 0.0
1077 1318 17 217 4 28 61 77 48 100.0 23054 ... 562.0 0 3 32 200.0 -100.0 -0.5 200.0 -100.0 -0.5
1078 1318 17 217 4 28 101 106 53 300.0 22987 ... 632.0 0 3 32 200.0 100.0 0.5 200.0 100.0 0.5

4 rows × 22 columns


In [123]:
# avg_price_city_cat
all_data = add_avg_per(df=all_data,what_to_avg='price',on=['city','category_name'],new_name='avg_price_city_cat',include_delta=True,include_perc=True)
all_data[all_data['user_id']==1318]


Out[123]:
user_id region city parent_category_name category_name param_1 param_2 param_3 price item_seq_number ... description_len avg_price_usr avg_price_usr_delta avg_price_usr_perc avg_price_usr_cat avg_price_usr_cat_delta avg_price_usr_cat_perc avg_price_city_cat avg_price_city_cat_delta avg_price_city_cat_perc
535 1318 17 217 4 28 61 103 44 200.0 23132 ... 32 200.0 0.0 0.0 200.0 0.0 0.0 783.75 -583.75 -0.744817
536 1318 17 217 4 28 61 34 44 200.0 23368 ... 35 200.0 0.0 0.0 200.0 0.0 0.0 783.75 -583.75 -0.744817
537 1318 17 217 4 28 61 77 48 100.0 23054 ... 32 200.0 -100.0 -0.5 200.0 -100.0 -0.5 783.75 -683.75 -0.872408
538 1318 17 217 4 28 101 106 53 300.0 22987 ... 32 200.0 100.0 0.5 200.0 100.0 0.5 783.75 -483.75 -0.617225

4 rows × 25 columns


In [124]:
# avg_price_region_cat
all_data = add_avg_per(df=all_data,what_to_avg='price',on=['region','category_name'],new_name='avg_price_region_cat',include_delta=True,include_perc=True)
all_data[all_data['user_id']==1318]


Out[124]:
user_id region city parent_category_name category_name param_1 param_2 param_3 price item_seq_number ... avg_price_usr_perc avg_price_usr_cat avg_price_usr_cat_delta avg_price_usr_cat_perc avg_price_city_cat avg_price_city_cat_delta avg_price_city_cat_perc avg_price_region_cat avg_price_region_cat_delta avg_price_region_cat_perc
678 1318 17 217 4 28 61 103 44 200.0 23132 ... 0.0 200.0 0.0 0.0 783.75 -583.75 -0.744817 1566.956522 -1366.956522 -0.872364
679 1318 17 217 4 28 61 34 44 200.0 23368 ... 0.0 200.0 0.0 0.0 783.75 -583.75 -0.744817 1566.956522 -1366.956522 -0.872364
680 1318 17 217 4 28 61 77 48 100.0 23054 ... -0.5 200.0 -100.0 -0.5 783.75 -683.75 -0.872408 1566.956522 -1466.956522 -0.936182
681 1318 17 217 4 28 101 106 53 300.0 22987 ... 0.5 200.0 100.0 0.5 783.75 -483.75 -0.617225 1566.956522 -1266.956522 -0.808546

4 rows × 28 columns


In [ ]:


In [ ]: