EDA2



In [111]:

    
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
from sklearn.preprocessing import LabelEncoder
import re 
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split
import nltk 
from extract_feat_base import * 

# --- NLTK resources -------------------------------------------------------
# nltk.data.load raises LookupError when a resource is not installed locally.
try:
    tokenizer = nltk.data.load('tokenizers/punkt/russian.pickle')
except LookupError:
    print('> trying to download punkt...')
    nltk.download('punkt')
    try:
        tokenizer = nltk.data.load('tokenizers/punkt/russian.pickle')
    except LookupError:
        # NOTE(review): some punkt distributions apparently ship no Russian
        # model (the recorded run hit this branch with punkt already
        # up-to-date). Fall back to the English model, and make sure the
        # result is actually bound to `tokenizer`.
        tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

# Importing the corpus reader is lazy and does not touch the data files; the
# LookupError for a missing corpus only surfaces on first access, so the
# download guard has to wrap the .words() call rather than the import.
from nltk.corpus import stopwords

try:
    stops = set(stopwords.words('russian'))
except LookupError:
    print('> trying to download stopwords...')
    nltk.download('stopwords')
    stops = set(stopwords.words('russian'))









    



> trying to download punkt...
[nltk_data] Downloading package punkt to C:\Users\gtesei/nltk_data...
[nltk_data]   Package punkt is already up-to-date!



In [112]:

    
# Dataset description handed to encode_dataset: the target column, the id
# column of the test set, and a per-column treatment tag.
# NOTE(review): tag semantics live in extract_feat_base; judging by the
# recorded output, 'REM' columns are dropped, 'LEN' yields <col>_len and
# 'DATE' additionally yields <col>_is_holiday -- confirm against that module.
meta = dict(
    target='deal_probability',
    test_id='item_id',
    cols=dict(
        item_id='REM',
        user_id='CAT',
        region='CAT',
        city='CAT',
        parent_category_name='CAT',
        category_name='CAT',
        param_1='CAT',
        param_2='CAT',
        param_3='CAT',
        title='LEN',
        description='LEN',
        price='NUM',
        item_seq_number='NUM',
        activation_date='DATE',
        user_type='CAT',
        image='REM',
        image_top_1='NUM',
    ),
)

train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

print('--------------> Basic Feature Engineering ... ')
# Only the first 1000 rows of each file are encoded here.
all_data, y_train = encode_dataset(
    train=train.iloc[:1000],
    test=test.iloc[:1000],
    meta=meta,
)
print(all_data.head())









    



--------------> Basic Feature Engineering ... 
0 price NUM
1 activation_date DATE
2 user_type CAT
3 title LEN
4 category_name CAT
5 parent_category_name CAT
6 description LEN
7 image REM
8 param_1 CAT
9 param_3 CAT
10 region CAT
11 item_id REM
12 image_top_1 NUM
13 param_2 CAT
14 city CAT
15 item_seq_number NUM
16 user_id CAT
   user_id  region  city  parent_category_name  category_name  param_1  \
0     1747      19    81                     4             41      122   
1      436      17   217                     2             22       59   
2     1123      16   213                     0              2       31   
3     1468      21   163                     4             41       18   
4     1859       4    51                     6              0      139   

   param_2  param_3    price  item_seq_number  activation_date  user_type  \
0       25      112    400.0                2                1          1   
1       25      112   3000.0               19                6          1   
2       25      112   4000.0                9                0          1   
3       25      112   2200.0              286                5          0   
4       32       13  40000.0                3                3          1   

   image_top_1  activation_date_is_holiday  title_len  description_len  
0       1008.0                           0          3                7  
1        692.0                           0          3                7  
2       3032.0                           0          2               17  
3        796.0                           0          1                3  
4       2264.0                           0          3                4

Average price per user (plus each listing's price delta and relative deviation from that average)



In [117]:

    
def add_avg_per(df,what_to_avg,on,new_name,include_delta=True,include_perc=True):
    """Add the per-group mean of a column, plus optional deviation columns.

    For every row, the mean of ``what_to_avg`` over all rows sharing the same
    ``on`` key(s) is stored in ``new_name``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    what_to_avg : str
        Name of the column to average.
    on : str or list of str
        Grouping column, or list of grouping columns.
    new_name : str
        Name of the column that receives the group mean.
    include_delta : bool, default True
        Also add ``<new_name>_delta`` = value - group mean.
    include_perc : bool, default True
        Also add ``<new_name>_perc`` = (value - group mean) / group mean.
        Division follows pandas float semantics, so a group mean of 0
        produces inf/NaN rather than an error.

    Returns
    -------
    pandas.DataFrame
        A new frame with the rows in the same order as ``df``, a fresh
        0..n-1 index, and the new column(s) appended on the right.

    Raises
    ------
    TypeError
        If ``on`` is neither a str nor a list.
    """
    if isinstance(on, str):
        keys = [on]
    elif isinstance(on, list):
        keys = list(on)
    else:
        raise TypeError('what type is on!')

    # reset_index returns a new frame, so the caller's df stays untouched and
    # the result carries a plain RangeIndex.
    out = df.reset_index(drop=True)

    # transform('mean') broadcasts each group's mean back onto its own rows.
    # Unlike aggregate-then-merge, this keeps the row order, cannot change the
    # row count, and never averages or re-joins the key columns themselves.
    # NOTE(review): rows whose key is NaN are expected to receive NaN here
    # (groupby drops NaN keys by default) -- verify on the pandas version in use.
    out[new_name] = out.groupby(keys)[what_to_avg].transform('mean')

    deviation = out[what_to_avg] - out[new_name]
    if include_delta:
        out[new_name + '_delta'] = deviation
    if include_perc:
        out[new_name + '_perc'] = deviation / out[new_name]
    return out



In [118]:

    
# Per-user average price, with each row's delta / relative deviation from it.
all_data = add_avg_per(
    df=all_data,
    what_to_avg='price',
    on='user_id',
    new_name='avg_price_usr',
)
# Spot-check a single user.
all_data.loc[all_data['user_id'] == 1318]









    Out[118]:







  
    
      
      user_id
      region
      city
      parent_category_name
      category_name
      param_1
      param_2
      param_3
      price
      item_seq_number
      activation_date
      user_type
      image_top_1
      activation_date_is_holiday
      title_len
      description_len
      avg_price_usr
      avg_price_usr_delta
      avg_price_usr_perc
    
  
  
    
      1075
      1318
      17
      217
      4
      28
      61
      103
      44
      200.0
      23132
      4
      0
      622.0
      0
      3
      32
      200.0
      0.0
      0.0
    
    
      1076
      1318
      17
      217
      4
      28
      61
      34
      44
      200.0
      23368
      0
      0
      122.0
      0
      5
      35
      200.0
      0.0
      0.0
    
    
      1077
      1318
      17
      217
      4
      28
      61
      77
      48
      100.0
      23054
      4
      0
      562.0
      0
      3
      32
      200.0
      -100.0
      -0.5
    
    
      1078
      1318
      17
      217
      4
      28
      101
      106
      53
      300.0
      22987
      3
      0
      632.0
      0
      3
      32
      200.0
      100.0
      0.5



In [119]:

    
# Spot-check a second user.
all_data.loc[all_data['user_id'] == 993]









    Out[119]:







  
    
      
      user_id
      region
      city
      parent_category_name
      category_name
      param_1
      param_2
      param_3
      price
      item_seq_number
      activation_date
      user_type
      image_top_1
      activation_date_is_holiday
      title_len
      description_len
      avg_price_usr
      avg_price_usr_delta
      avg_price_usr_perc
    
  
  
    
      286
      993
      23
      252
      5
      15
      130
      0
      115
      3300000.0
      76411
      2
      2
      2220.0
      0
      6
      100
      1.926667e+06
      1.373333e+06
      0.712803
    
    
      287
      993
      23
      252
      5
      15
      130
      0
      115
      1690000.0
      78778
      1
      2
      1295.0
      0
      6
      49
      1.926667e+06
      -2.366667e+05
      -0.122837
    
    
      288
      993
      23
      252
      5
      13
      130
      83
      112
      790000.0
      78812
      1
      2
      2218.0
      0
      4
      40
      1.926667e+06
      -1.136667e+06
      -0.589965



In [122]:

    
# Average price per (user, category) pair, with delta / relative deviation.
all_data = add_avg_per(
    df=all_data,
    what_to_avg='price',
    on=['user_id', 'category_name'],
    new_name='avg_price_usr_cat',
    include_delta=True,
    include_perc=True,
)
all_data.loc[all_data['user_id'] == 1318]









    Out[122]:







  
    
      
      user_id
      region
      city
      parent_category_name
      category_name
      param_1
      param_2
      param_3
      price
      item_seq_number
      ...
      image_top_1
      activation_date_is_holiday
      title_len
      description_len
      avg_price_usr
      avg_price_usr_delta
      avg_price_usr_perc
      avg_price_usr_cat
      avg_price_usr_cat_delta
      avg_price_usr_cat_perc
    
  
  
    
      1075
      1318
      17
      217
      4
      28
      61
      103
      44
      200.0
      23132
      ...
      622.0
      0
      3
      32
      200.0
      0.0
      0.0
      200.0
      0.0
      0.0
    
    
      1076
      1318
      17
      217
      4
      28
      61
      34
      44
      200.0
      23368
      ...
      122.0
      0
      5
      35
      200.0
      0.0
      0.0
      200.0
      0.0
      0.0
    
    
      1077
      1318
      17
      217
      4
      28
      61
      77
      48
      100.0
      23054
      ...
      562.0
      0
      3
      32
      200.0
      -100.0
      -0.5
      200.0
      -100.0
      -0.5
    
    
      1078
      1318
      17
      217
      4
      28
      101
      106
      53
      300.0
      22987
      ...
      632.0
      0
      3
      32
      200.0
      100.0
      0.5
      200.0
      100.0
      0.5
    
  

4 rows × 22 columns



In [123]:

    
# avg_price_city_cat: average price per (city, category) pair,
# with delta / relative deviation.
all_data = add_avg_per(
    df=all_data,
    what_to_avg='price',
    on=['city', 'category_name'],
    new_name='avg_price_city_cat',
    include_delta=True,
    include_perc=True,
)
all_data.loc[all_data['user_id'] == 1318]









    Out[123]:







  
    
      
      user_id
      region
      city
      parent_category_name
      category_name
      param_1
      param_2
      param_3
      price
      item_seq_number
      ...
      description_len
      avg_price_usr
      avg_price_usr_delta
      avg_price_usr_perc
      avg_price_usr_cat
      avg_price_usr_cat_delta
      avg_price_usr_cat_perc
      avg_price_city_cat
      avg_price_city_cat_delta
      avg_price_city_cat_perc
    
  
  
    
      535
      1318
      17
      217
      4
      28
      61
      103
      44
      200.0
      23132
      ...
      32
      200.0
      0.0
      0.0
      200.0
      0.0
      0.0
      783.75
      -583.75
      -0.744817
    
    
      536
      1318
      17
      217
      4
      28
      61
      34
      44
      200.0
      23368
      ...
      35
      200.0
      0.0
      0.0
      200.0
      0.0
      0.0
      783.75
      -583.75
      -0.744817
    
    
      537
      1318
      17
      217
      4
      28
      61
      77
      48
      100.0
      23054
      ...
      32
      200.0
      -100.0
      -0.5
      200.0
      -100.0
      -0.5
      783.75
      -683.75
      -0.872408
    
    
      538
      1318
      17
      217
      4
      28
      101
      106
      53
      300.0
      22987
      ...
      32
      200.0
      100.0
      0.5
      200.0
      100.0
      0.5
      783.75
      -483.75
      -0.617225
    
  

4 rows × 25 columns



In [124]:

    
# avg_price_region_cat: average price per (region, category) pair,
# with delta / relative deviation.
all_data = add_avg_per(
    df=all_data,
    what_to_avg='price',
    on=['region', 'category_name'],
    new_name='avg_price_region_cat',
    include_delta=True,
    include_perc=True,
)
all_data.loc[all_data['user_id'] == 1318]









    Out[124]:







  
    
      
      user_id
      region
      city
      parent_category_name
      category_name
      param_1
      param_2
      param_3
      price
      item_seq_number
      ...
      avg_price_usr_perc
      avg_price_usr_cat
      avg_price_usr_cat_delta
      avg_price_usr_cat_perc
      avg_price_city_cat
      avg_price_city_cat_delta
      avg_price_city_cat_perc
      avg_price_region_cat
      avg_price_region_cat_delta
      avg_price_region_cat_perc
    
  
  
    
      678
      1318
      17
      217
      4
      28
      61
      103
      44
      200.0
      23132
      ...
      0.0
      200.0
      0.0
      0.0
      783.75
      -583.75
      -0.744817
      1566.956522
      -1366.956522
      -0.872364
    
    
      679
      1318
      17
      217
      4
      28
      61
      34
      44
      200.0
      23368
      ...
      0.0
      200.0
      0.0
      0.0
      783.75
      -583.75
      -0.744817
      1566.956522
      -1366.956522
      -0.872364
    
    
      680
      1318
      17
      217
      4
      28
      61
      77
      48
      100.0
      23054
      ...
      -0.5
      200.0
      -100.0
      -0.5
      783.75
      -683.75
      -0.872408
      1566.956522
      -1466.956522
      -0.936182
    
    
      681
      1318
      17
      217
      4
      28
      101
      106
      53
      300.0
      22987
      ...
      0.5
      200.0
      100.0
      0.5
      783.75
      -483.75
      -0.617225
      1566.956522
      -1266.956522
      -0.808546
    
  

4 rows × 28 columns



In [ ]:



In [ ]:

	user_id	region	city	parent_category_name	category_name	param_1	param_2	param_3	price	item_seq_number	activation_date	image_top_1	title_len	description_len	avg_price_usr	avg_price_usr_delta	avg_price_usr_perc
1075	1318	17	217	4	28	61	103	44	200.0	23132	4	622.0	3	32	200.0	0.0	0.0
1076	1318	17	217	4	28	61	34	44	200.0	23368	0	122.0	5	35	200.0	0.0	0.0
1077	1318	17	217	4	28	61	77	48	100.0	23054	4	562.0	3	32	200.0	-100.0	-0.5
1078	1318	17	217	4	28	101	106	53	300.0	22987	3	632.0	3	32	200.0	100.0	0.5

	user_id	region	city	parent_category_name	category_name	param_1	param_2	param_3	price	item_seq_number	activation_date	user_type	image_top_1	title_len	description_len	avg_price_usr	avg_price_usr_delta	avg_price_usr_perc
286	993	23	252	5	15	130	0	115	3300000.0	76411	2	2	2220.0	6	100	1.926667e+06	1.373333e+06	0.712803
287	993	23	252	5	15	130	0	115	1690000.0	78778	1	2	1295.0	6	49	1.926667e+06	-2.366667e+05	-0.122837
288	993	23	252	5	13	130	83	112	790000.0	78812	1	2	2218.0	4	40	1.926667e+06	-1.136667e+06	-0.589965