In [ ]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import scipy.sparse
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory
from subprocess import check_output
# Show which data files are available in the input directory.
input_listing = check_output(["ls", "../input"])
print(input_listing.decode("utf8"))
# Any results you write to the current directory are saved as output.
In [ ]:
!cat ../input/train.tsv | head -n 5
In [ ]:
# Load the tab-separated test split and preview its first rows.
df_test = pd.read_csv("../input/test.tsv", delimiter="\t")
df_test.head(5)
In [ ]:
# Load the tab-separated training split and preview its first rows.
df_train = pd.read_csv("../input/train.tsv", delimiter="\t")
df_train.head(5)
In [ ]:
# Print column dtypes, non-null counts and memory usage of the training frame.
df_train.info()
In [ ]:
# Token-count vectorizer for brand names (default CountVectorizer settings).
brand_vect = CountVectorizer()
def get_brands(df: pd.DataFrame) -> pd.Series:
    """Return the ``brand_name`` column of *df* with missing values filled.

    Missing brands are replaced by the placeholder string ``"NotABrand"`` so
    that every row yields a string the brand vectorizer can tokenize.
    """
    return df["brand_name"].fillna("NotABrand")
# Learn the brand vocabulary from the training set. X_train holds only the
# brand counts at this point; it is reassigned to the full feature matrix later.
X_train = brand_vect.fit_transform(get_brands(df_train))
In [ ]:
X_train.shape # (n_rows, brand vocabulary size): about 5k brand tokens
In [ ]:
# TF-IDF over item names: English stop words removed, vocabulary capped at
# 10000 terms. Fitted on the training names only.
name_vect = TfidfVectorizer(
    stop_words="english",
    max_features=10000,
)
name_vect.fit(df_train["name"])
In [ ]:
# TF-IDF over item descriptions: English stop words removed, vocabulary capped
# at 10000 terms. Missing descriptions are replaced by the placeholder
# "No description yet" before fitting.
descr_vect = TfidfVectorizer(max_features=10000, stop_words="english")
descr_vect.fit(df_train["item_description"].fillna("No description yet"))
In [ ]:
def get_features(df: pd.DataFrame):
    """Build the sparse feature matrix for *df* from the fitted vectorizers.

    Column groups, in order: brand token counts (``brand_vect``), the raw
    ``shipping`` value, the raw ``item_condition_id`` value, TF-IDF of
    ``name`` (``name_vect``) and TF-IDF of ``item_description``
    (``descr_vect``). The vectorizers are only applied (``transform``), never
    refitted, so every frame passed through here gets the same column layout.

    Returns a ``scipy.sparse`` CSR matrix with ``len(df)`` rows.
    """
    brands = brand_vect.transform(get_brands(df))
    # hstack needs 2-D blocks, so each numeric column becomes (n_rows, 1).
    shipping = df["shipping"].values.reshape((len(df), 1))
    condition = df["item_condition_id"].values.reshape((len(df), 1))
    names = name_vect.transform(df["name"])
    # Fill missing descriptions with the same placeholder used when fitting.
    descr = descr_vect.transform(df["item_description"].fillna("No description yet"))
    # hstack returns COO by default, which cannot be row-sliced; CSR can, and
    # it spares downstream estimators a format conversion on every call.
    return scipy.sparse.hstack(
        [brands, shipping, condition, names, descr],
        format="csr",
    )
In [ ]:
# Rebuild X_train with the full feature set produced by get_features
# (brand, shipping, condition, name and description columns).
X_train = get_features(df_train)
X_train.shape
In [ ]:
# Train on log(1 + price): squared error in this space is the squared log
# error that RMSLE (Root Mean Squared Logarithmic Error) measures.
# np.log1p is the dedicated, numerically accurate form of log(1 + x).
y_train = np.log1p(df_train["price"].values)
y_train
In [ ]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score
In [ ]:
%%time
LR = Ridge()
res = cross_val_score(LR, X = X_train, y=y_train, cv=3, scoring="neg_mean_squared_error")
In [ ]:
((-res) ** 0.5).mean() # mean per-fold RMSE on log(1 + price), i.e. RMSLE; recorded run: 0.49722430553049496
In [ ]:
!cat ../input/sample_submission.csv |head -n5
In [ ]:
%%time
# Apply the already-fitted vectorizers to the test set (no refitting).
X_test = get_features(df_test)
In [ ]:
# Both matrices come from get_features, so their column counts should match.
X_test.shape, X_train.shape
In [ ]:
%%time
# Refit a default Ridge model on the whole training set for the final predictions.
LR = Ridge()
LR.fit(X_train, y_train)
In [ ]:
%%time
# Predictions are in log(1 + price) space, the same space as y_train.
prediction = LR.predict(X_test)
In [ ]:
# Assemble the submission frame. np.expm1 (exp(x) - 1) inverts the
# log(1 + price) transform applied to the training target, and is accurate
# for predictions close to zero.
df = pd.DataFrame(
    {
        "test_id": df_test["test_id"].values,
        "price": np.expm1(prediction),
    },
    columns=["test_id", "price"],  # pin the column order explicitly
)
df.head()
In [ ]:
# Write the submission file without the index column.
# NOTE(review): the filename is spelled "predction.csv"; the preview cell that
# follows reads the same name, so rename both together if this is corrected.
df.to_csv("predction.csv", index=False)
In [ ]:
!cat predction.csv |head -n5
In [ ]:
!cat ../input/sample_submission.csv |head -n5
In [ ]: