In [111]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
import re
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split
import nltk
from extract_feat_base import *
# Load the Russian Punkt sentence tokenizer, fetching the Punkt models on first use.
try:
    tokenizer = nltk.data.load('tokenizers/punkt/russian.pickle')
except LookupError:
    # A missing NLTK resource is reported as LookupError; anything else should surface.
    print('> trying to download punkt...')
    nltk.download('punkt')
    # Retry with the same Russian model so `tokenizer` is always bound afterwards.
    tokenizer = nltk.data.load('tokenizers/punkt/russian.pickle')

from nltk.corpus import stopwords

# The stopwords corpus is loaded lazily: a missing corpus only shows up on first
# access (`.words(...)`), not at import time, so the fallback wraps the access.
# For English text, use stopwords.words("english") instead.
try:
    stops = set(stopwords.words('russian'))
except LookupError:
    print('> trying to download stopwords...')
    nltk.download('stopwords')
    stops = set(stopwords.words('russian'))
In [112]:
# Dataset description handed to encode_dataset: the target column, the id column
# of the test set, and a role tag for every raw column.
# NOTE(review): the tag vocabulary (REM / CAT / LEN / NUM / DATE) is interpreted by
# encode_dataset, which is not defined in this notebook (presumably it comes from the
# extract_feat_base star import) -- confirm the exact meaning of each tag there.
meta = dict(
    target='deal_probability',
    test_id='item_id',
    cols={
        'item_id': 'REM',
        'user_id': 'CAT',
        'region': 'CAT',
        'city': 'CAT',
        'parent_category_name': 'CAT',
        'category_name': 'CAT',
        'param_1': 'CAT',
        'param_2': 'CAT',
        'param_3': 'CAT',
        'title': 'LEN',
        'description': 'LEN',
        'price': 'NUM',
        'item_seq_number': 'NUM',
        'activation_date': 'DATE',
        'user_type': 'CAT',
        'image': 'REM',
        'image_top_1': 'NUM',
    },
)

train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

print('--------------> Basic Feature Engineering ... ')
# Only the first 1000 rows of each split are encoded here.
all_data, y_train = encode_dataset(
    train=train.iloc[:1000],
    test=test.iloc[:1000],
    meta=meta,
)
print(all_data.head())
In [117]:
def add_avg_per(df,what_to_avg,on,new_name,include_delta=True,include_perc=True):
    """Attach the per-group mean of a column, plus optional deviation features.

    Args:
        df: Input DataFrame. It is not modified; a new frame is returned.
        what_to_avg: Name of the column whose mean is computed within each group.
        on: Grouping key(s): a single column name, or a list/tuple of column names.
        new_name: Name of the column that receives the group mean.
        include_delta: If True, also add ``<new_name>_delta`` =
            ``what_to_avg - new_name``.
        include_perc: If True, also add ``<new_name>_perc`` =
            ``(what_to_avg - new_name) / new_name``. The division is not guarded,
            so a group mean of 0 yields inf/NaN in this column.

    Returns:
        A DataFrame with the same rows, in the same order, as ``df`` (with a fresh
        integer index produced by the merge) and the new column(s) added. Rows
        whose key is NaN receive NaN for the group mean instead of being dropped.

    Raises:
        TypeError: If ``on`` is neither a string nor a list/tuple.
    """
    if isinstance(on, str):
        keys = [on]
    elif isinstance(on, (list, tuple)):
        keys = list(on)
    else:
        raise TypeError('what type is on!')

    # Aggregate only the value column. Averaging the key columns as well would fail
    # for non-numeric keys and would leave the keys both in the index and in the
    # columns, which makes the merge below ambiguous.
    group_mean = df.groupby(keys)[what_to_avg].mean().rename(new_name).reset_index()

    # Drop a stale copy of the output column so that re-running the same call
    # overwrites it instead of producing suffixed `_x` / `_y` duplicates.
    left = df.drop(columns=[new_name], errors='ignore')

    # A left join keeps every input row (including rows with NaN keys, which
    # groupby leaves out of the aggregate) and preserves the input row order.
    merged = left.merge(group_mean, how='left', on=keys)
    assert len(merged) == len(df)

    if include_delta:
        merged[new_name + '_delta'] = merged[what_to_avg] - merged[new_name]
    if include_perc:
        merged[new_name + '_perc'] = (merged[what_to_avg] - merged[new_name]) / merged[new_name]
    return merged
In [118]:
# Mean price per user; the default flags also add the delta and relative-deviation columns.
all_data = add_avg_per(
    df=all_data,
    what_to_avg='price',
    on='user_id',
    new_name='avg_price_usr',
)
# Inspect the rows of one user to eyeball the new columns.
all_data.loc[all_data['user_id'] == 1318]
Out[118]:
In [119]:
# Inspect the rows of a second user.
all_data.loc[all_data['user_id'] == 993]
Out[119]:
In [122]:
# Mean price per (user, category) pair, with delta and relative-deviation columns.
all_data = add_avg_per(
    df=all_data,
    what_to_avg='price',
    on=['user_id', 'category_name'],
    new_name='avg_price_usr_cat',
    include_delta=True,
    include_perc=True,
)
all_data.loc[all_data['user_id'] == 1318]
Out[122]:
In [123]:
# avg_price_city_cat: mean price per (city, category) pair, with delta and
# relative-deviation columns.
all_data = add_avg_per(
    df=all_data,
    what_to_avg='price',
    on=['city', 'category_name'],
    new_name='avg_price_city_cat',
    include_delta=True,
    include_perc=True,
)
all_data.loc[all_data['user_id'] == 1318]
Out[123]:
In [124]:
# avg_price_region_cat: mean price per (region, category) pair, with delta and
# relative-deviation columns.
all_data = add_avg_per(
    df=all_data,
    what_to_avg='price',
    on=['region', 'category_name'],
    new_name='avg_price_region_cat',
    include_delta=True,
    include_perc=True,
)
all_data.loc[all_data['user_id'] == 1318]
Out[124]:
In [ ]:
In [ ]: