In [1]:
import numpy as np
import pandas as pd 
import xgboost as xgb

from datetime import datetime
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA, FastICA
from sklearn.metrics import r2_score

In [2]:
# Load the Kaggle train/test frames, using the competition's "ID" column
# as the DataFrame index on both.
train, test = (
    pd.read_csv("data/{}.csv".format(split), index_col="ID")
    for split in ("train", "test")
)

In [3]:
# Integer-encode every categorical (object-dtype) column. Each encoder is
# fit on the union of train and test values so both frames share a single
# consistent label -> integer mapping (test may contain labels unseen in train).
for col in train.columns:
    if train[col].dtype != "object":
        continue
    encoder = LabelEncoder()
    combined_values = list(train[col].values) + list(test[col].values)
    encoder.fit(combined_values)
    train[col] = encoder.transform(list(train[col].values))
    test[col] = encoder.transform(list(test[col].values))

In [4]:
# Number of components to extract with each decomposition.
n_comp = 10

# Feature matrix for fitting: everything except the target column "y".
feature_train = train.drop(["y"], axis=1)

# PCA — orthogonal projection onto the top-variance directions.
pca = PCA(n_components=n_comp, random_state=42)
pca2_results_train = pca.fit_transform(feature_train)
pca2_results_test = pca.transform(test)

# ICA — decomposition into statistically independent components.
ica = FastICA(n_components=n_comp, random_state=42)
ica2_results_train = ica.fit_transform(feature_train)
ica2_results_test = ica.transform(test)


/Users/datitran/anaconda/envs/kaggle/lib/python3.5/site-packages/scipy/linalg/basic.py:1018: RuntimeWarning: internal gelsd driver lwork query error, required iwork dimension not returned. This is likely the result of LAPACK bug 0038, fixed in LAPACK 3.2.2 (released July 21, 2010). Falling back to 'gelss' driver.
  warnings.warn(mesg, RuntimeWarning)

In [ ]:
"""
pca_train_df = pd.DataFrame(pca2_results_train).add_prefix("pca_")
ica_train_df = pd.DataFrame(ica2_results_train).add_prefix("ica_")

x_train = pd.concat([pca_train_df, ica_train_df], axis=1)

pca_test_df = pd.DataFrame(pca2_results_test).add_prefix("pca_")
ica_test_df = pd.DataFrame(ica2_results_test).add_prefix("ica_")

x_test = pd.concat([pca_test_df, ica_test_df], axis=1)

"""

In [5]:
# Append the decomposition outputs as new columns pca_1..pca_<n_comp> and
# ica_1..ica_<n_comp> on both train and test (column j of each result array
# becomes the 1-indexed feature j+1).
for comp_idx in range(n_comp):
    suffix = str(comp_idx + 1)
    train["pca_" + suffix] = pca2_results_train[:, comp_idx]
    test["pca_" + suffix] = pca2_results_test[:, comp_idx]
    train["ica_" + suffix] = ica2_results_train[:, comp_idx]
    test["ica_" + suffix] = ica2_results_test[:, comp_idx]

# Target vector and its mean (the mean is used as xgboost's base_score later).
y_train = train["y"]
y_mean = np.mean(y_train)

In [6]:
# Wrap the frames in xgboost's DMatrix: training matrix carries y as label,
# the test matrix is features only (test has no "y" column).
dtrain = xgb.DMatrix(train.drop("y", axis=1), y_train)
dtest = xgb.DMatrix(test)

In [7]:
startTime = datetime.now()

In [10]:
# Train one XGBoost model per random seed and write one submission file per
# run, so predictions can later be averaged across seeds.
r2_list = []  # collects (seed, train-set R^2) for each run

seed_nr = 10  # number of independent seeded runs

for i in range(seed_nr):
    xgb_params = {
        "n_trees": 500,
        "eta": 0.005,            # small learning rate, compensated by many rounds
        "max_depth": 4,
        "subsample": 0.95,
        "objective": "reg:linear",
        "eval_metric": "rmse",
        "base_score": y_mean,    # start boosting from the target mean
        "silent": 1,
        "seed": i                # the only parameter varied between runs
    }

    # Round count was previously chosen via xgb.cv with early stopping;
    # fixed at 900 here to keep all seeded runs comparable.
    num_boost_rounds = 900
    model = xgb.train(dict(xgb_params, silent=0), dtrain, num_boost_round=num_boost_rounds)

    # BUG FIX: r2_score's signature is (y_true, y_pred). The arguments were
    # previously swapped; R^2 is not symmetric in its arguments, so the
    # reported values (and the filenames below) were wrong.
    r2_value = r2_score(dtrain.get_label(), model.predict(dtrain))

    y_pred = model.predict(dtest)

    output = pd.DataFrame({"id": test.index, "y": y_pred})

    # NOTE(review): the files were read with index_col="ID" (uppercase) —
    # confirm whether the submission format expects "ID" rather than "id".
    output.to_csv("xgb_various_seeds/submission_{seed}_{r2}.csv".format(seed=i, r2=r2_value), index=False)
    r2_list.append((i, r2_value))

In [ ]:
print(datetime.now() - startTime)

In [ ]: