In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import gc
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn import linear_model
import os
# Project root, i.e. the directory that contains `data/`.
# Set the DMC2017_ROOT environment variable to point somewhere else instead of
# editing this cell; the literal is only the fallback location.
path = os.environ.get('DMC2017_ROOT', '/media/roman/Main/Programing/contest/dmc2017/dmc-2017/')
os.chdir(path)
In [2]:
# Read the raw items and train tables; both files use '|' as the field separator.
items = pd.read_csv('data/raw/items.csv',sep='|')
train = pd.read_csv('data/raw/train.csv',sep='|')
In [3]:
train.head(5)
Out[3]:
In [4]:
items.head(5)
Out[4]:
In [5]:
train.info(null_counts=True)
In [6]:
items.info()
In [7]:
# Join the item columns onto every train row via the shared `pid` key.
train_items = pd.merge(train, items, on='pid')
# The un-merged frame is not needed any more. Note the call parentheses:
# a bare `gc.collect` only evaluates the function object and frees nothing.
del train
gc.collect()
Out[7]:
In [8]:
def toCategorical(df):
    """Cast the known categorical columns of `df` to pandas 'category' dtype.

    Columns from the fixed list that are absent from `df` are skipped.
    The frame is modified in place and also returned.
    """
    categorical = ('availability', 'group', 'content', 'unit', 'pharmForm',
                   'campaignIndex', 'salesIndex', 'category', 'manufacturer')
    present = [name for name in categorical if name in df.columns]
    for name in present:
        df[name] = df[name].astype('category')
    return df
In [9]:
train_items=toCategorical(train_items)
In [10]:
# Mean of competitorPrice / rrp over the rows where the ratio is defined.
# The value is passed to solveNA, which multiplies it with rrp to fill
# missing competitor prices.
price_ratio = train_items['competitorPrice'] / train_items['rrp']
# Keep finite ratios only: NaN comes from a missing price, +/-inf from
# rrp == 0, and a single inf would turn the mean into inf.
price_ratio = price_ratio[np.isfinite(price_ratio)]
coef_competitorPrice_to_rrp = price_ratio.mean()
print(coef_competitorPrice_to_rrp)
In [11]:
def _fill_missing(series, value):
    """Return `series` with missing entries replaced by `value`.

    A categorical series only accepts fill values that are categories, so the
    value is registered first -- but only when it is not a category already,
    because `cat.add_categories` raises ValueError on duplicates.
    """
    if series.dtype.name == 'category' and value not in series.cat.categories:
        series = series.cat.add_categories([value])
    return series.fillna(value)


def solveNA(df, df2, coef, flag):
    """Fill missing values in `df` (in place) and return it.

    Parameters
    ----------
    df : DataFrame to clean.
    df2 : DataFrame whose `rrp` column is used to estimate missing
        `competitorPrice` values (`rrp * coef`); only read when flag != 1.
    coef : multiplier applied to `df2['rrp']`.
    flag : 1 skips the `competitorPrice` imputation; any other value
        performs it.

    In both modes `pharmForm`, `category` and `campaignIndex` (when present)
    get the placeholders 'no_pharmForm', 410 and 'D', and `category` /
    `manufacturer` (when present) are re-cast to int and then to 'category'.
    """
    placeholders = (('pharmForm', 'no_pharmForm'),
                    ('category', 410),
                    ('campaignIndex', 'D'))

    if flag != 1:
        df['competitorPrice'] = df['competitorPrice'].fillna(df2['rrp'] * coef)

    # Works for plain and for categorical columns alike, and is safe to run
    # again on a frame that already contains the placeholder category.
    for name, value in placeholders:
        if name in df.columns:
            df[name] = _fill_missing(df[name], value)

    # Going through int removes the float representation that NaN forced on
    # these identifier columns before they become categories.
    for name in ('category', 'manufacturer'):
        if name in df.columns:
            df[name] = df[name].astype('int').astype('category')
    return df
In [12]:
items=solveNA(items,train_items,coef_competitorPrice_to_rrp,1)
train_items=solveNA(train_items,train_items,coef_competitorPrice_to_rrp,0)
In [13]:
train_items.info(null_counts=True)
In [14]:
def Dummies(df):
    """Return a copy of `df` with one-hot indicator columns appended.

    Indicators are built for 'availability', 'unit', 'salesIndex' and
    'campaignIndex' and named `<column>_<value>`. The source columns are kept.
    Columns missing from `df` are skipped, and the encoding is forced via
    `columns=` so that it does not depend on the columns already having
    'category' dtype.
    """
    columns = ['availability', 'unit', 'salesIndex', 'campaignIndex']
    present = [name for name in columns if name in df.columns]
    if not present:
        # Nothing to encode; still hand back a new frame like the normal path.
        return df.copy()
    dumm = pd.get_dummies(df[present], columns=present)
    return pd.concat([df, dumm], axis=1)
In [15]:
train_items=Dummies(train_items)
In [16]:
def moreFeautures(df):
    """Add three derived columns to `df` in place and return it.

    day_of_week  : `day` modulo 7
    discount     : `price` divided by `rrp`
    compDiscount : `competitorPrice` divided by `price`
    """
    df['day_of_week'] = df['day'].mod(7)
    df['discount'] = df['price'].div(df['rrp'])
    df['compDiscount'] = df['competitorPrice'].div(df['price'])
    return df
In [17]:
train_items=moreFeautures(train_items)
In [18]:
train_items.info()
In [19]:
import itertools as it


def solveCategorical(c1, df1, df2, flag, n_rows=2756003.0):
    """Add group-level statistics of column `c1` to `df2` and return `df2`.

    For every combination of one, two or three of the columns
    group / content / pharmForm / category / manufacturer, `df1` is grouped
    by that combination and the mean of `c1` per group is written to
    `df2['<c1>_<cols joined by _>_mean']`, matched on the same columns of
    `df2`. Combinations that do not occur in `df1` yield NaN.

    Parameters
    ----------
    c1 : name of the column in `df1` to aggregate.
    df1 : frame providing the statistics.
    df2 : frame receiving the new columns (modified in place).
    flag : when 1, a `..._count` column is added next to each `..._mean`.
    n_rows : divisor applied to the counts of multi-column combinations.
        The default is the constant that used to be hard-coded here --
        presumably the number of training rows; confirm before changing.

    NOTE(review): single-column counts are stored raw while multi-column
    counts are divided by `n_rows`. This asymmetry is preserved on purpose so
    regenerated features match previously saved ones.
    """
    def lookup(frame, table):
        # One dictionary lookup per row, keyed by the tuple of its values.
        return [table.get(tuple(row), np.nan) for row in frame.values]

    columns = ['group', 'content', 'pharmForm', 'category', 'manufacturer']
    for size in range(1, 4):
        for col in it.combinations(columns, size):
            print(col)
            keys = list(col)
            grouped = df1.groupby(keys)[c1]
            prefix = c1 + '_' + '_'.join(col)

            means = grouped.mean().to_dict()
            if size == 1:
                df2[prefix + '_mean'] = df2[col[0]].map(means)
            else:
                df2[prefix + '_mean'] = lookup(df2[keys], means)

            if flag == 1:
                counts = grouped.count().to_dict()
                if size == 1:
                    df2[prefix + '_count'] = df2[col[0]].map(counts)
                else:
                    df2[prefix + '_count'] = np.array(lookup(df2[keys], counts)) / n_rows
    return df2

# One-off feature generation, kept commented out. Uncomment to rebuild the
# file that the notebook reads as data/interim/items_v1.csv.
#items = solveCategorical('revenue',train_items,items,1)
#items = solveCategorical('click',train_items,items,0)
#items = solveCategorical('basket',train_items,items,0)
#items = solveCategorical('order',train_items,items,0)
#items.to_csv('data/interim/items_v1.csv')
In [20]:
# Read the item table from data/interim/items_v1.csv.
items = pd.read_csv('data/interim/items_v1.csv')
In [21]:
items.info(max_cols=136)
In [22]:
# Columns taken from `items`: the `pid` join key followed by every column
# whose name contains 'revenue', in their original order.
items_pred = ['pid'] + [name for name in items.columns if 'revenue' in name]
In [23]:
train_items=pd.merge(train_items,items[items_pred],on='pid')
In [24]:
# Model inputs, in the order the feature matrix is built.
# Deliberately NOT used: lineID, day, pid, availability, competitorPrice,
# click, basket, order, price, revenue, manufacturer, group, content, unit,
# pharmForm, salesIndex, category, campaignIndex.
predictors = [
    'adFlag', 'genericProduct', 'rrp',
    'availability_1', 'availability_2', 'availability_3', 'availability_4',
    'unit_CM', 'unit_G', 'unit_KG', 'unit_L',
    'unit_M', 'unit_ML', 'unit_P', 'unit_ST',
    'salesIndex_40', 'salesIndex_44', 'salesIndex_52', 'salesIndex_53',
    'campaignIndex_A', 'campaignIndex_B', 'campaignIndex_C',
    'day_of_week', 'discount', 'compDiscount',
]
# revenue_<cols>_mean / revenue_<cols>_count for every combination of one,
# two or three of the five grouping columns, in itertools.combinations order.
predictors += [
    'revenue_' + '_'.join(_combo) + _suffix
    for _size in range(1, 4)
    for _combo in it.combinations(
        ['group', 'content', 'pharmForm', 'category', 'manufacturer'], _size)
    for _suffix in ('_mean', '_count')
]
In [25]:
from sklearn.model_selection import train_test_split
# Draw a reproducible 10% sample of the rows for model selection.
# test_size is left at its default (the complement of train_size) because an
# explicit 0 is rejected with a ValueError by current scikit-learn releases.
# NOTE(review): this materialises the remaining 90% in test_set, so delete it
# as soon as possible if memory is tight.
train_set, test_set = train_test_split(train_items, train_size=0.1, random_state=42)
In [26]:
# Target and feature matrix for the sampled rows.
y_train = train_set['revenue']
x_train = train_set[predictors]
# Release the large frames; `gc.collect` must be called to have any effect.
del train_items, items, test_set
gc.collect()
Out[26]:
In [27]:
x_train.head(1)
Out[27]:
In [28]:
scaler = MinMaxScaler()
x_train=scaler.fit_transform(x_train)
In [29]:
from sklearn.model_selection import cross_val_score
def rmse_cv(model):
    """Return the per-fold RMSE of `model` from 5-fold cross-validation.

    Uses the notebook-level `x_train` / `y_train`. The scorer yields negated
    MSE values, hence the sign flip before taking the square root.
    """
    neg_mse = cross_val_score(model, x_train, y_train,
                              scoring="neg_mean_squared_error", cv=5)
    return np.sqrt(-neg_mse)
model_ridge = linear_model.Ridge()
# Mean cross-validated RMSE of a ridge model for each candidate alpha.
alphas = [1, 4, 5, 6, 7, 8, 9, 10, 11, 15]
cv_ridge = []
for alpha in alphas:
    fold_rmse = rmse_cv(linear_model.Ridge(alpha=alpha))
    cv_ridge.append(fold_rmse.mean())
In [30]:
# Index the mean CV errors by their alpha and plot error against alpha.
cv_ridge = pd.Series(cv_ridge, index = alphas)
cv_ridge.plot()
plt.xlabel("alpha")
plt.ylabel("rmse")
Out[30]:
In [31]:
# Fit a ridge model with a fixed alpha of 6 on the sampled training data.
model_ridge = linear_model.Ridge(alpha=6, fit_intercept=True, max_iter=10000)
model_ridge.fit(x_train, y_train)
# LassoCV picks its alpha from the listed candidates by internal cross-validation.
model_lasso = linear_model.LassoCV(alphas = [1, 0.16, 0.1, 0.001, 0.0005]).fit(x_train, y_train)
In [32]:
rmse_cv(model_lasso).mean()
Out[32]:
In [33]:
cv_ridge.min()
Out[33]:
In [34]:
# Lasso coefficients labelled with their feature names.
c = pd.Series(model_lasso.coef_, index = train_set[predictors].columns)
n_kept = sum(c != 0)
n_dropped = sum(c == 0)
print("Lasso picked " + str(n_kept) + " variables and eliminated the other " + str(n_dropped) + " variables")
In [35]:
# Show the ten most negative and the ten most positive lasso coefficients.
coef_sorted = c.sort_values()
imp_coef = pd.concat([coef_sorted.head(10), coef_sorted.tail(10)])
import matplotlib
matplotlib.rcParams['figure.figsize'] = (8.0, 10.0)
imp_coef.plot(kind = "barh")
plt.title("Coefficients in the Lasso Model")
Out[35]:
In [36]:
# Same view for the ridge model: ten smallest and ten largest coefficients.
c = pd.Series(model_ridge.coef_, index = train_set[predictors].columns)
coef_sorted = c.sort_values()
imp_coef = pd.concat([coef_sorted.head(10), coef_sorted.tail(10)])
matplotlib.rcParams['figure.figsize'] = (8.0, 10.0)
imp_coef.plot(kind = "barh")
plt.title("Coefficients in the Ridge Model")
Out[36]:
In [37]:
# Drop the sampled data before rebuilding the full training set.
# `gc.collect` has to be called; the bare name does nothing.
del train_set, x_train, y_train
gc.collect()
Out[37]:
In [38]:
# Reload the raw tables and rebuild the merged training frame from scratch.
items = pd.read_csv('data/raw/items.csv', sep='|')
train = pd.read_csv('data/raw/train.csv', sep='|')
train_items = pd.merge(train, items, on='pid')
# Free the un-merged copy; note the call parentheses on gc.collect().
del train
gc.collect()
Out[38]:
In [39]:
# Run the same preparation steps as before, this time on every training row.
train_items = toCategorical(train_items)
train_items = solveNA(train_items, train_items, coef_competitorPrice_to_rrp, 0)
train_items = Dummies(train_items)
train_items = moreFeautures(train_items)
# Attach the item-level revenue statistics selected in items_pred.
items = pd.read_csv('data/interim/items_v1.csv')
train_items = pd.merge(train_items, items[items_pred], on='pid')
y_train = train_items['revenue']
x_train = train_items[predictors]
# Only x_train / y_train are needed from here on; actually run the collector.
del train_items, items
gc.collect()
Out[39]:
In [41]:
scaler = MinMaxScaler()
x_train=scaler.fit_transform(x_train)
In [42]:
model_lasso = linear_model.LassoCV(alphas = [1, 0.16, 0.1, 0.001, 0.0005]).fit(x_train, y_train)
In [43]:
model_ridge = linear_model.Ridge(alpha=6, fit_intercept=True, max_iter=10000)
model_ridge.fit(x_train, y_train)
Out[43]:
In [44]:
# Training data is no longer needed once both models are fitted.
# `gc.collect` has to be called; the bare name does nothing.
del y_train, x_train
gc.collect()
Out[44]:
In [45]:
# Build the feature matrix for the rows to be predicted, using the same
# preparation steps that were applied to the training data.
items = pd.read_csv('data/raw/items.csv', sep='|')
clas = pd.read_csv('data/raw/class.csv', sep='|')
clas_items = pd.merge(clas, items, on='pid')
clas_items = toCategorical(clas_items)
clas_items = solveNA(clas_items, clas_items, coef_competitorPrice_to_rrp, 0)
clas_items = Dummies(clas_items)
clas_items = moreFeautures(clas_items)
items = pd.read_csv('data/interim/items_v1.csv')
clas_items = pd.merge(clas_items, items[items_pred], on='pid')
# Placeholder submission; its length follows the data instead of a
# hard-coded row count, so it stays valid if the input file changes.
submission = pd.DataFrame({
    "lineID": clas_items["lineID"],
    "revenue": np.zeros(len(clas_items))
})
x_test = clas_items[predictors]
del clas_items, clas, items
gc.collect()
Out[45]:
In [46]:
# Fill remaining gaps with the per-column mean of the test features.
x_test = x_test.fillna(x_test.mean())
# Apply the scaling that was fitted on the training data. Calling
# fit_transform here would re-fit the scaler on the test set and feed the
# models features on a different scale from the one they were trained on.
x_test = scaler.transform(x_test)
In [47]:
lasso_preds = model_lasso.predict(x_test)
ridge_preds = model_ridge.predict(x_test)
In [49]:
predictions = pd.DataFrame({"ridge": ridge_preds, "lasso": lasso_preds})
# Negative predictions are floored at 0, independently per model. Indexing the
# frame with a row mask and assigning 0 would wipe BOTH columns of a row
# whenever only one model went negative.
predictions = predictions.clip(lower=0)
lasso_preds = predictions['lasso']
ridge_preds = predictions['ridge']
# Blend: plain average of the two models.
com_pred = (lasso_preds + ridge_preds) / 2.0
# Assign by position so the result never depends on index alignment between
# `predictions` and `submission`.
submission['revenue'] = com_pred.values
submission_sorted = submission.sort_values("lineID")
submission_sorted.to_csv("data/Uni_Polytechnic_Lviv_1.csv", index=False, sep='|')
In [50]:
submission_sorted
Out[50]: