In [ ]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import scipy.sparse

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

from subprocess import check_output
# Show which data files are present in the input directory.
print(str(check_output(["ls", "../input"]), "utf8"))

# Any results you write to the current directory are saved as output.

In [ ]:
!cat ../input/train.tsv | head -n 5

In [ ]:
# Read the tab-separated test split and preview its first rows.
df_test = pd.read_csv(filepath_or_buffer="../input/test.tsv", delimiter="\t")
df_test.head(5)

In [ ]:
# Read the tab-separated training split and preview its first rows.
df_train = pd.read_csv(filepath_or_buffer="../input/train.tsv", delimiter="\t")
df_train.head(5)

In [ ]:
# Print the column dtypes and non-null counts of the training frame.
df_train.info()

In [ ]:
# Count vectorizer (default settings) used for the brand_name column.
brand_vect = CountVectorizer()
def get_brands(df):
    """Return the ``brand_name`` column of ``df`` with missing values filled.

    Missing brands are replaced by the placeholder string "NotABrand" so the
    column can be passed straight to a text vectorizer. ``df`` itself is not
    modified; a new Series is returned.
    """
    brand_column = df["brand_name"]
    return brand_column.fillna(value="NotABrand")
# Fit the brand vocabulary on the training brands. NOTE: at this point X_train
# holds brand counts only; the name is rebound to the full feature matrix
# later in the notebook.
X_train = brand_vect.fit_transform(get_brands(df_train))

In [ ]:
X_train.shape  # about 5k columns; NOTE(review): default CountVectorizer splits on word tokens, so these are presumably brand-name tokens rather than whole brands — confirm

In [ ]:
# TF-IDF vectorizer for item names: English stop words removed and the
# vocabulary capped at 10000 terms. Fitted on the training names only.
name_vect = TfidfVectorizer(
    stop_words="english",
    max_features=10000,
)
name_vect.fit(df_train["name"])

In [ ]:
# TF-IDF vectorizer for item descriptions: English stop words removed and the
# vocabulary capped at 10000 terms. Missing descriptions are replaced by the
# placeholder "No description yet" before fitting on the training data.
descr_vect = TfidfVectorizer(
    stop_words="english",
    max_features=10000,
)
descr_vect.fit(df_train["item_description"].fillna(value="No description yet"))

|


In [ ]:
def get_features(df):
    """Assemble the sparse feature matrix for ``df``.

    Column blocks, left to right:
      1. brand-name counts from ``brand_vect``
      2. the ``shipping`` column (one dense column)
      3. the ``item_condition_id`` column (one dense column)
      4. TF-IDF of ``name`` from ``name_vect``
      5. TF-IDF of ``item_description`` from ``descr_vect``

    Only ``transform`` is called on the vectorizers, so they must already be
    fitted; the same function can then be used for both train and test frames
    and yields the same column layout for each.

    Returns a ``scipy.sparse`` matrix in CSR format with ``len(df)`` rows.
    """
    brands = brand_vect.transform(get_brands(df))
    # Single-column dense blocks; hstack accepts them next to sparse blocks.
    shipping = df["shipping"].values.reshape((len(df), 1))
    condition = df["item_condition_id"].values.reshape((len(df), 1))
    names = name_vect.transform(df["name"])
    # Same missing-value placeholder that descr_vect was fitted with.
    descr = descr_vect.transform(df["item_description"].fillna("No description yet"))
    # hstack produces COO by default, which supports neither row indexing nor
    # slicing; request CSR so the result can be sliced and used directly.
    return scipy.sparse.hstack(
        [brands, shipping, condition, names, descr],
        format="csr",
    )

In [ ]:
# Build the full training feature matrix (replaces the earlier brand-only
# X_train) and show its dimensions.
X_train = get_features(df_train)
X_train.shape

In [ ]:
# Regress on log(1 + price): squared error on this target corresponds to the
# Root Mean Squared Logarithmic Error on the raw price. log1p is the
# numerically accurate form of log(1 + x).
y_train = np.log1p(df_train["price"].values)
y_train

In [ ]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score

In [ ]:
%%time
# 3-fold cross-validation of a default Ridge model; `res` holds one negated
# mean-squared-error score per fold.
LR = Ridge()
res = cross_val_score(
    estimator=LR,
    X=X_train,
    y=y_train,
    scoring="neg_mean_squared_error",
    cv=3,
)

In [ ]:
# Flip the sign of the negated MSE scores, take the root per fold, then average.
np.sqrt(-res).mean()  # value recorded in the original note: 0.49722430553049496


In [ ]:
!cat ../input/sample_submission.csv |head -n5

In [ ]:
%%time
# Featurize the test frame; get_features only calls transform, so the
# vectorizers fitted on the training data are reused unchanged.
X_test = get_features(df_test)

In [ ]:
# Train and test matrices must have the same number of feature columns,
# otherwise the model fitted on X_train cannot score X_test.
assert X_test.shape[1] == X_train.shape[1], "train/test feature width mismatch"
X_test.shape, X_train.shape

In [ ]:
%%time
# Fit a fresh Ridge model (default hyperparameters) on the whole training
# matrix; this is the model used for the test predictions.
LR = Ridge()
LR.fit(X_train, y_train)

In [ ]:
%%time
# Predictions are on the same scale as y_train (the log-transformed price),
# not raw prices.
prediction = LR.predict(X_test)

In [ ]:
# Build the submission frame: one row per test item, columns test_id and price.
df = pd.DataFrame(
    {
        "test_id": df_test["test_id"].values,
        # expm1 is the exact inverse of the log(1 + price) target transform.
        # The linear model can predict below 0 on the log scale, which would
        # map to a negative price, so clip at 0.
        "price": np.maximum(np.expm1(prediction), 0.0),
    },
    # Fix the column order explicitly rather than relying on dict ordering.
    columns=["test_id", "price"],
)
df.head()

In [ ]:
# Write the submission without the index column. The file name's spelling
# ("predction") is kept as-is because the following preview cell reads it.
df.to_csv("predction.csv", index=False)

In [ ]:
!cat predction.csv |head -n5

In [ ]:
!cat ../input/sample_submission.csv |head -n5

In [ ]: