In [59]:
import numpy as np
import pandas as pd
import warnings
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix  # pandas.tools.plotting was removed in pandas 0.20+
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression, Lasso, ElasticNet
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
warnings.filterwarnings("ignore")
%matplotlib inline
plt.style.use("ggplot")
plt.rcParams["figure.figsize"] = (12, 8)
In [2]:
fl1 = "data_files/train.csv"
fl2 = "data_files/test.csv"
In [3]:
train_df = pd.read_csv(fl1, header=0)
test_df = pd.read_csv(fl2, header=0)
In [4]:
print(train_df.shape)
print(test_df.shape)
In [61]:
tr_nulls = train_df.isnull().sum().to_frame().transpose()
te_nulls = test_df.isnull().sum().to_frame().transpose()
In [65]:
# Report train-set null counts for the columns shared with the test set:
# for c in te_nulls.columns:
#     print("{}: {}".format(c, tr_nulls[c][0]))
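A more compact check than the per-column loop above — a minimal sketch, assuming nothing beyond the frames already loaded — lists only the columns that actually contain nulls:

In [ ]:
# Show only columns with at least one missing value (empty output means the data is complete)
nulls = train_df.isnull().sum()
print(nulls[nulls > 0])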
In [5]:
train_df.corr()
Out[5]:
In [6]:
# Pairwise scatter matrix of every numeric column; left commented out because it is
# very slow (and unreadable) on a frame this wide.
# scatter_matrix(train_df, alpha=0.2, diagonal='kde')
In [7]:
print(train_df['loss'].min())
print(train_df['loss'].max())
In [8]:
train_df['loss'].plot.hist(bins=100, color="darkred")
Out[8]:
In [9]:
np.log(train_df['loss']).plot.hist(bins=100, color="darkred")
Out[9]:
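np.log assumes every loss is strictly positive; if zero or near-zero losses were possible, log1p would be the safer transform. A minimal sketch (log1p and expm1 are exact inverses):

In [ ]:
# log1p tolerates zero losses; expm1 inverts it back to the original scale
log_loss_vals = np.log1p(train_df['loss'])
assert np.allclose(np.expm1(log_loss_vals), train_df['loss'])
log_loss_vals.plot.hist(bins=100, color="darkred")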
In [10]:
train_df2 = train_df.copy()
In [11]:
train_df2['log_loss'] = np.log(train_df['loss'])  # vectorized; equivalent to the elementwise map
In [12]:
train_df2.corr()
Out[12]:
In [13]:
cont_vars = ["cont{}".format(i) for i in range(1, 15)]
In [14]:
X_cl = np.asarray(train_df2[cont_vars])
In [17]:
km = KMeans(n_clusters=7)
In [18]:
cont_labels = km.fit_predict(X_cl)
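The cluster labels above are not used downstream; one hedged way to sanity-check them and fold them in as a candidate feature (whether they carry signal is unverified):

In [ ]:
# Cluster sizes, then the labels attached as a candidate categorical feature
print(np.bincount(cont_labels))
train_df2['cont_cluster'] = cont_labels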
In [19]:
pca = PCA()
In [20]:
pca.fit(X_cl)
Out[20]:
In [22]:
plt.figure(1)
plt.clf()
plt.axes([.2, .2, .7, .7])
plt.plot(pca.explained_variance_, linewidth=2)
plt.axis('tight')
plt.xlabel('n_components')
plt.ylabel('explained_variance_')
Out[22]:
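The scree plot shows raw component variances; the cumulative explained-variance ratio answers the more practical question of how many components cover, say, 95% of the variance. A short sketch on the same fitted PCA:

In [ ]:
# Smallest number of components covering 95% of the variance in the 14 continuous features
cum_ratio = np.cumsum(pca.explained_variance_ratio_)
print(np.argmax(cum_ratio >= 0.95) + 1)
plt.plot(cum_ratio, linewidth=2)
plt.xlabel('n_components')
plt.ylabel('cumulative explained variance ratio')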
In [25]:
cat_vars = ["cat{}".format(i) for i in range(1, 117)]
In [26]:
train_df[cat_vars].head()
Out[26]:
In [60]:
# Cardinality of each categorical feature:
# for c in cat_vars:
#     print("{}: {}".format(c, len(train_df[c].unique())))
In [29]:
train_df.shape
Out[29]:
In [30]:
num_cols_w_dums = 131
for c in cat_vars:
    # each categorical column expands to (n_levels - 1) dummies under drop_first=True
    num_cols_w_dums += len(train_df[c].unique()) - 1
In [31]:
num_cols_w_dums
Out[31]:
In [32]:
train_df3 = train_df.copy()
In [33]:
train_df3 = pd.get_dummies(train_df3, columns=cat_vars, drop_first=True)
In [34]:
train_df3.shape
Out[34]:
In [35]:
train_df3.head()
Out[35]:
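One caveat with get_dummies: encoding train and test separately yields mismatched columns whenever a category level appears in only one of the two frames. A hedged sketch of one common fix, reindexing the test dummies against the training columns (encoding the concatenated frames, or fixing the category levels up front, is stricter):

In [ ]:
# Dummy-encode the test set, then force its columns to match the training feature columns;
# levels unseen in test become all-zero columns, anything not in the training frame is dropped
test_df3 = pd.get_dummies(test_df, columns=cat_vars, drop_first=True)
feature_cols = train_df3.drop(['id', 'loss'], axis=1).columns
test_df3 = test_df3.reindex(columns=feature_cols, fill_value=0)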
In [36]:
pca2 = PCA()
In [37]:
pca2.fit(np.asarray(train_df3.drop(['id', 'loss'], axis=1)))
Out[37]:
In [39]:
plt.figure(1)
plt.clf()
plt.axes([.2, .2, .7, .7])
plt.plot(pca2.explained_variance_, linewidth=2)
plt.axis('tight')
plt.xlabel('n_components')
plt.ylabel('explained_variance_')
Out[39]:
In [40]:
pca_200_comps = PCA(n_components=200)
In [41]:
X_pca_200 = pca_200_comps.fit_transform(np.asarray(train_df3.drop(['id', 'loss'], axis=1)))
In [42]:
X_pca_200.shape
Out[42]:
In [44]:
y_pca_200 = np.asarray(train_df3['loss']).reshape(-1, 1)
In [46]:
lm1 = LinearRegression()
In [47]:
lm1.fit(X_pca_200, y_pca_200)
Out[47]:
In [48]:
lm1.score(X_pca_200, y_pca_200)
Out[48]:
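The R² above is computed on the same rows the model was fit to, so it is an optimistic, in-sample figure. A minimal out-of-sample check on the same 200-component representation, using the notebook's MAE scoring (strictly, the PCA fit itself should also sit inside each fold, e.g. via a Pipeline, to avoid leakage):

In [ ]:
# Cross-validated MAE for the linear model on the 200 PCA components
scores = cross_val_score(LinearRegression(), X_pca_200, np.asarray(train_df3['loss']),
                         scoring="neg_mean_absolute_error")
print(np.mean(np.abs(scores)))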
In [51]:
ls = Lasso()
In [53]:
cross_val_score(ls, np.asarray(train_df3.drop(['id', 'loss'], axis=1)),
                np.asarray(train_df3['loss']),  # 1-D target; the (-1, 1) reshape only triggers a warning
                scoring="neg_mean_absolute_error")
Out[53]:
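The Lasso above runs at its default alpha=1.0; LassoCV picks the regularization strength by internal cross-validation along a path of candidate alphas. A short sketch — X_full and y_full are names introduced here, not from the notebook:

In [ ]:
from sklearn.linear_model import LassoCV

# Select alpha by cross-validation over an automatically chosen path of alphas
X_full = np.asarray(train_df3.drop(['id', 'loss'], axis=1))
y_full = np.asarray(train_df3['loss'])
ls_cv = LassoCV(cv=3).fit(X_full, y_full)
print(ls_cv.alpha_)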
In [56]:
rf = RandomForestRegressor(n_estimators=20, max_depth=10)
In [57]:
# Commented out here: this cross-validation is slow on the full dummy-encoded frame.
# cross_val_score(rf, np.asarray(train_df3.drop(['id', 'loss'], axis=1)), np.asarray(train_df3['loss']),
#                 scoring="neg_mean_absolute_error")
In [66]:
l1_mixes = np.arange(0.1, 1.0, 0.1)
In [68]:
# Sweep the L1/L2 mix at fixed alpha and record the mean CV MAE for each ratio
mx_dict = dict()
for ratio in l1_mixes:
    en = ElasticNet(alpha=0.1, l1_ratio=ratio)
    scs = cross_val_score(en, np.asarray(train_df3.drop(['id', 'loss'], axis=1)),
                          np.asarray(train_df3['loss']), scoring="neg_mean_absolute_error")
    mx_dict[str(ratio)] = np.absolute(np.mean(scs))
In [69]:
mx_dict
Out[69]:
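Reading the best mix straight out of mx_dict, plus a sketch of the built-in alternative — ElasticNetCV tunes alpha and l1_ratio jointly, so the manual sweep above becomes optional (cv=3 here is an arbitrary choice, not from the notebook):

In [ ]:
# Ratio with the lowest mean CV MAE from the sweep above
print(min(mx_dict, key=mx_dict.get))

# ElasticNetCV searches alphas and the given l1_ratio grid by cross-validation
from sklearn.linear_model import ElasticNetCV
en_cv = ElasticNetCV(l1_ratio=list(l1_mixes), cv=3)
en_cv.fit(np.asarray(train_df3.drop(['id', 'loss'], axis=1)), np.asarray(train_df3['loss']))
print(en_cv.alpha_, en_cv.l1_ratio_)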