In [1]:
%config InlineBackend.figure_format = 'png'
%matplotlib inline
In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib
from scipy import stats
import pylab
import matplotlib.pyplot as plt
from scipy.stats import skew
from scipy.stats import pearsonr
In [3]:
train = pd.read_csv("treino.csv")
data_columns_names = ['matricula', 'vetorial','lpt','p1','ic','lp1','calculo2','discreta','p2','grafos','fis_classica','lp2','cra','calculo1']
train.columns = data_columns_names
In [4]:
train.head()
Out[4]:
In [5]:
matricula = train["matricula"]
features = train.drop(["matricula"], axis=1)
To search for missing values we first apply a function that checks whether each value in our data frame is a real number, returning True for those that are and False otherwise. We then count how many Trues and Falses were obtained. The table below presents these counts and confirms that every value of every feature is present in our data, so there is no need to treat missing values.
In [6]:
features.applymap(np.isreal).apply(pd.value_counts)
Out[6]:
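The same check can also be done more directly with pandas; this one-cell sketch (assuming the same features frame) simply counts null entries per column, and every count should be zero.
In [ ]:
# Sketch: count missing (NaN) entries per column; all counts should be zero.
print(features.isnull().sum())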
In [7]:
features.apply(lambda x: stats.shapiro(x))  # Shapiro-Wilk normality test per feature: (W statistic, p-value)
Out[7]:
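To make the Shapiro-Wilk output easier to read, the sketch below organizes the W statistic and p-value per feature and flags which ones are compatible with normality. The 0.05 significance level is an assumption for illustration, not part of the original analysis.
In [ ]:
# Sketch: tabulate Shapiro-Wilk results per feature (0.05 level is an assumed threshold).
shapiro_results = pd.DataFrame(
    {name: stats.shapiro(features[name]) for name in features.columns},
    index=["W", "p_value"]).T
shapiro_results["looks_normal"] = shapiro_results["p_value"] > 0.05
print(shapiro_results)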
In [8]:
numeric_feats = features.dtypes[features.dtypes != "object"].index
skewness = features[numeric_feats].apply(lambda x: skew(x.dropna())) #compute skewness
print(skewness)
In [9]:
def show_qqplot(x, data):
    # QQ-plot of the feature x against the normal distribution.
    fig = plt.figure()
    ax = fig.add_subplot(111)
    stats.probplot(data[x], dist="norm", plot=pylab)
    ax.set_title(x)
    pylab.show()

for name in features.columns:
    show_qqplot(name, features)
From this analysis we conclude that the only feature that needs to be transformed is lp2. We reached this conclusion by:
1) checking the data for normality (normal data is, by definition, not skewed) and ruling out a transformation for ic, grafos and cra;
2) calculating the skewness of each feature; and
3) inspecting the QQ-plot of each feature.
Now we reduce lp2's negative skew with the exp2 function and show its new values along with the metrics mentioned above.
In [10]:
skewed_feats = skewness[skewness < -0.75]  # strongly left-skewed features (only lp2 here)
skewed_feats = skewed_feats.index
features[skewed_feats] = np.exp2(features[skewed_feats])
In [11]:
features.head()
Out[11]:
In [12]:
numeric_feats = features.dtypes[features.dtypes != "object"].index
print(features[numeric_feats].apply(lambda x: skew(x.dropna())))
show_qqplot("lp2", features)
In [13]:
y = features.cra
X_train = features.drop(["cra"], axis=1)
In [14]:
from sklearn.linear_model import Ridge, RidgeCV, ElasticNet, Lasso, LassoCV, LassoLarsCV
from sklearn.model_selection import cross_val_score

def vector_norm(w):
    # Euclidean (L2) norm of the coefficient vector.
    return np.sqrt(np.sum(w**2))

def rmse_cv(model):
    # 10-fold cross-validated RMSE of a model on the training data.
    rmse = np.sqrt(-cross_val_score(model, X_train, y, scoring="neg_mean_squared_error", cv=10))
    return rmse

def coefficients_graphic(model, title):
    # Horizontal bar plot of the fitted coefficients.
    coef = pd.Series(model.coef_, index=X_train.columns)
    matplotlib.rcParams['figure.figsize'] = (8.0, 10.0)
    coef.plot(kind="barh")
    plt.title(title)

def residuals_graph(model):
    # Scatter plot of residuals against predictions on the training data.
    matplotlib.rcParams['figure.figsize'] = (6.0, 6.0)
    preds = pd.DataFrame({"preds": model.predict(X_train), "true": y})
    preds["residuals"] = preds["true"] - preds["preds"]
    preds.plot(x="preds", y="residuals", kind="scatter")

def cv_rmse_graph(cv_rmse, alpha_levels):
    # Line plot of cross-validated RMSE against the hyperparameter grid.
    cv_rmse = pd.Series(cv_rmse, index=alpha_levels)
    cv_rmse.plot(title="Validation - Just Do It")
    plt.xlabel("alpha")
    plt.ylabel("rmse")
In [15]:
clf = Ridge(alpha=0)
clf.fit(X_train, y)
vector_norm(clf.coef_)
Out[15]:
In [16]:
coefficients_graphic(clf, "Coefficients in the Ridge Model Not Regularized")
residuals_graph(clf)
In [17]:
alphas = [0, 0.05, 0.1, 0.3, 1, 3, 5, 10, 15, 30, 40, 50, 75]
cv_rmse = [rmse_cv(Ridge(alpha = level)).mean()
for level in alphas]
cv_rmse_graph(cv_rmse, alphas)
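RidgeCV is imported above but not used; as an alternative to the manual grid, the sketch below lets it pick alpha by cross-validation over the same grid (skipping 0, which means no regularization). The cv=10 choice mirrors rmse_cv and is an assumption, not part of the original cells.
In [ ]:
# Sketch: let RidgeCV search the same alpha grid automatically.
ridge_cv = RidgeCV(alphas=[a for a in alphas if a > 0], cv=10)
ridge_cv.fit(X_train, y)
print(ridge_cv.alpha_)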
In [18]:
clf = Ridge(alpha=40)
clf.fit(X_train, y)
vector_norm(clf.coef_)
Out[18]:
In [19]:
coefficients_graphic(clf, "Coefficients in Regularized Ridge Model")
residuals_graph(clf)
In [20]:
model_lasso = Lasso(alpha = 0).fit(X_train, y)
In [21]:
coef_lasso = pd.Series(model_lasso.coef_, index = X_train.columns)
In [22]:
print("Lasso picked " + str(sum(coef_lasso != 0)) + " variables and eliminated the other " + str(sum(coef_lasso == 0)) + " variables")
In [23]:
coefficients_graphic(model_lasso, "Coefficients in the Lasso Model Not Regularized")
residuals_graph(model_lasso)
In [24]:
alphas = [0, 0.001, 0.01, 0.02, 0.03, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1]
cv_rmse = [rmse_cv(Lasso(alpha = level)).mean()
for level in alphas]
cv_rmse_graph(cv_rmse, alphas)
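LassoCV is likewise imported but unused; a short sketch of the equivalent automatic search is shown below. Again, cv=10 mirrors rmse_cv and the raised max_iter is a precaution; both choices are assumptions.
In [ ]:
# Sketch: LassoCV searches its own alpha path by cross-validation.
lasso_cv = LassoCV(cv=10, max_iter=10000).fit(X_train, y)
print(lasso_cv.alpha_)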
In [25]:
model_lasso = Lasso(alpha = 0.06).fit(X_train, y)
In [26]:
coef_lasso = pd.Series(model_lasso.coef_, index = X_train.columns)
print("Lasso picked " + str(sum(coef_lasso != 0)) + " variables and eliminated the other " + str(sum(coef_lasso == 0)) + " variables")
In [27]:
print(coef_lasso)
coefficients_graphic(model_lasso, "Coefficients in Regularized Lasso Model")
residuals_graph(model_lasso)
In [28]:
from sklearn.neighbors import KNeighborsRegressor
# With 10-fold CV each training fold holds ~90% of the samples, which caps the number of neighbors.
max_n_neighbors = int(y.shape[0] - 0.1*y.shape[0])
neighbors = range(1, max_n_neighbors)
cv_rmse = [rmse_cv(KNeighborsRegressor(n_neighbors = level)).mean()
for level in neighbors]
cv_rmse_graph(cv_rmse, neighbors)
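The same search can be written with GridSearchCV; the sketch below is an illustration, with the scoring and cv=10 chosen to mirror rmse_cv rather than taken from the original cells.
In [ ]:
# Sketch: pick n_neighbors with GridSearchCV instead of the manual loop.
from sklearn.model_selection import GridSearchCV
knn_search = GridSearchCV(KNeighborsRegressor(),
                          param_grid={"n_neighbors": list(neighbors)},
                          scoring="neg_mean_squared_error", cv=10)
knn_search.fit(X_train, y)
print(knn_search.best_params_)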
In [29]:
best_knn_fit = KNeighborsRegressor(n_neighbors = 22).fit(X_train, y)
residuals_graph(best_knn_fit)
To choose an adequate value for each hyperparameter we used 10-fold cross-validation and selected the value that minimizes the RMSE; the chosen values are listed below. Regarding the residuals vs. predictions plots, no clear pattern can be identified in the distribution of the points, so we conclude the models are well fitted.
Ridge: alpha = 40
LASSO: alpha = 0.06
KNN: neighbors = 22
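As a final sanity check (not part of the original cells), the rmse_cv helper can compare the three tuned models directly; the exact numbers depend on the data.
In [ ]:
# Sketch: compare the cross-validated RMSE of the three tuned models.
for name, model in [("Ridge (alpha=40)", Ridge(alpha=40)),
                    ("Lasso (alpha=0.06)", Lasso(alpha=0.06)),
                    ("KNN (k=22)", KNeighborsRegressor(n_neighbors=22))]:
    print(name, rmse_cv(model).mean())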
In [34]:
teste = pd.read_csv("graduados_teste.csv")