In [147]:
import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn import metrics
from scipy.spatial.distance import cdist
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import LeaveOneOut
from sklearn.linear_model import Ridge
from math import sqrt
from sklearn.tree import DecisionTreeRegressor
import scipy.stats as sp
In [145]:
# Compact numpy display: 2 decimal places and no scientific notation.
np.set_printoptions(suppress=True, precision=2)
In [27]:
# Record the pandas version used in this run (provenance for reproducibility).
pd.__version__
Out[27]:
In [28]:
# Base directory for all CSV inputs.
# NOTE(review): hardcoded absolute local path — won't work on other machines;
# consider a configurable DATA_DIR.
root = "/home/felipe/python-sandbox/python3/notebooks/test/"
In [25]:
# Load the Q1 clustering dataset and preview the first rows.
csv_path_q1 = root + "agrupamento_Q1.csv"
df_agrup_q1 = pd.read_csv(csv_path_q1)
df_agrup_q1.head()
Out[25]:
In [38]:
# Feature matrix for clustering; shape shown as a sanity check.
X = df_agrup_q1.values
X.shape
Out[38]:
In [41]:
# Load the predefined initial centroids for K-Means.
df_agrup_centroides_q1 = pd.read_csv(root+"agrup_centroides_Q1.csv")
df_agrup_centroides_q1.head()
Out[41]:
In [63]:
# Drop the first CSV column (presumably a row-index column — TODO confirm
# against the file) and keep the remaining columns as centroid coordinates.
centroids = df_agrup_centroides_q1.values[:,1:]
centroids.shape
Out[63]:
In [65]:
# K-Means with k=5, seeded with the first 5 provided centroids as a fixed
# initialization; n_init=1 because init is an explicit array (multiple
# restarts would be pointless with a deterministic start).
kmeans = KMeans(n_clusters=5, random_state=42, init=centroids[:5,:],max_iter=10,n_init=1).fit(X)
print(kmeans.cluster_centers_)
In [66]:
# Elbow method: fit K-Means for k = 1..11 (each run seeded with the first k
# provided centroids) and plot the "distortion" — the mean euclidean distance
# from each sample to its nearest cluster center.
# Fix: removed the unused `colors` / `markers` lists left over from an earlier
# version of this cell.
plt.clf()
distortions = []
K = range(1, 12)
for k in K:
    kmeanModel = KMeans(n_clusters=k, random_state=42, init=centroids[:k, :],
                        max_iter=10, n_init=1).fit(X)
    # Mean distance of every sample to its closest center.
    distortions.append(
        sum(np.min(cdist(X, kmeanModel.cluster_centers_, 'euclidean'), axis=1)) / X.shape[0]
    )
# Plot the elbow
plt.plot(K, distortions, 'bx-')
plt.xlabel('k')
plt.ylabel('Distortion')
plt.title('The Elbow Method showing the optimal k')
plt.xticks(np.arange(1, 12, 1))
plt.show()
In [31]:
# Load the Q3 classification dataset and preview it.
df_classif_q3 = pd.read_csv(root+"classificacao_Q3.csv")
df_classif_q3.head()
Out[31]:
In [67]:
# Distinct levels and top frequency of 'Genero' (viewed as categorical).
df_classif_q3['Genero'].astype('category').describe()
Out[67]:
In [68]:
# Distinct levels and top frequency of 'Idade' (viewed as categorical).
df_classif_q3['Idade'].astype('category').describe()
Out[68]:
In [69]:
# Distinct levels and top frequency of 'Escolaridade' (viewed as categorical).
df_classif_q3['Escolaridade'].astype('category').describe()
Out[69]:
In [70]:
# Distinct levels and top frequency of 'Profissao' (viewed as categorical).
df_classif_q3['Profissao'].astype('category').describe()
Out[70]:
In [71]:
# Work on a copy so the raw df_classif_q3 frame stays intact across re-runs.
df_classif_q3_ohe = df_classif_q3.copy()
In [72]:
# One-hot encode 'Genero': pd.get_dummies(columns=...) drops the source
# column and appends the 'Genero_*' indicator columns at the end —
# equivalent to the concat-then-drop pattern.
df_classif_q3_ohe = pd.get_dummies(df_classif_q3_ohe, columns=['Genero'], prefix='Genero')
In [73]:
# One-hot encode 'Idade' in the same fashion ('Idade_*' columns appended).
df_classif_q3_ohe = pd.get_dummies(df_classif_q3_ohe, columns=['Idade'], prefix='Idade')
In [74]:
# One-hot encode 'Escolaridade' ('Escolaridade_*' columns appended).
df_classif_q3_ohe = pd.get_dummies(df_classif_q3_ohe, columns=['Escolaridade'], prefix='Escolaridade')
In [75]:
# One-hot encode 'Profissao' ('Profissao_*' columns appended).
df_classif_q3_ohe = pd.get_dummies(df_classif_q3_ohe, columns=['Profissao'], prefix='Profissao')
In [76]:
# Preview the fully one-hot-encoded frame.
df_classif_q3_ohe.head()
Out[76]:
In [95]:
# Mean of Target — for a 0/1 label this is the positive-class share
# (baseline accuracy check); TODO confirm Target is binary.
df_classif_q3_ohe['Target'].mean()
Out[95]:
In [80]:
# Number of rows in the dataset (used to pick the 500-row split below).
len(df_classif_q3_ohe)
Out[80]:
In [83]:
# Deterministic split: first 500 rows train, remainder test. Feature and
# target matrices are built once and sliced, instead of dropping 'Target'
# twice.
n_train = 500
feature_matrix = df_classif_q3_ohe.drop(['Target'], axis=1).values
target_matrix = df_classif_q3_ohe[['Target']].values
X_train, X_test = feature_matrix[:n_train, :], feature_matrix[n_train:, :]
y_train, y_test = target_matrix[:n_train, :], target_matrix[n_train:, :]
X_train.shape, X_test.shape, y_train.shape, y_test.shape
Out[83]:
In [85]:
# Fit Gaussian Naive Bayes; ravel() flattens y from shape (n, 1) to (n,).
clf = GaussianNB()
clf.fit(X_train,y_train.ravel())
Out[85]:
In [93]:
# Accuracy on the training split.
preds_train = clf.predict(X_train)
metrics.accuracy_score(y_train,preds_train)
Out[93]:
In [94]:
# Accuracy on the held-out split.
preds_test = clf.predict(X_test)
metrics.accuracy_score(y_test,preds_test)
Out[94]:
In [32]:
# Load the Q4 classification dataset and preview it.
df_classif_q4 = pd.read_csv(root+"classificacao_Q4.csv")
df_classif_q4.head()
Out[32]:
In [107]:
# Number of rows (sanity check before choosing the number of CV folds).
len(df_classif_q4)
Out[107]:
In [101]:
# Feature matrix and label vector for the Q4 classification task.
data = df_classif_q4.drop(['target'],axis=1).values
target = df_classif_q4['target'].values
In [102]:
In [140]:
# 10-fold cross-validation of a 15-NN classifier (euclidean metric):
# print each fold's accuracy, then report the mean across folds.
# Fold-local names avoid shadowing the X_train/X_test from earlier cells.
accs = []
kf = KFold(n_splits=10)
for fold_train_idx, fold_test_idx in kf.split(data):
    fold_X_tr, fold_X_te = data[fold_train_idx], data[fold_test_idx]
    fold_y_tr, fold_y_te = target[fold_train_idx], target[fold_test_idx]
    knn = KNeighborsClassifier(n_neighbors=15, metric='euclidean')
    knn.fit(fold_X_tr, fold_y_tr)
    fold_preds = knn.predict(fold_X_te)
    fold_acc = metrics.accuracy_score(fold_y_te, fold_preds)
    accs.append(fold_acc)
    print(fold_acc)
np.mean(accs)
Out[140]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [33]:
# Load the Q6 regression dataset and preview it.
df_regr_q6 = pd.read_csv(root+"regressao_Q6.csv")
df_regr_q6.head()
Out[33]:
In [115]:
# Number of rows in the Q6 dataset (relevant: LOO below runs n fits).
len(df_regr_q6)
Out[115]:
In [137]:
# Features and target for the Q6 regression task; show the first sample.
X = df_regr_q6.drop(['target'],axis=1).values
y = df_regr_q6['target'].values
X[0],y[0]
Out[137]:
In [138]:
# Leave-one-out CV of Ridge regression (alpha=1.7): for every split, record
# the RMSE on the single held-out sample and on the training portion, then
# print the mean of each. Mean test RMSE estimates generalization error;
# mean train RMSE shows the fit on seen data.
loo = LeaveOneOut()
test_errs = []
train_errs = []
for tr_idx, te_idx in loo.split(X):
    X_tr, X_te = X[tr_idx], X[te_idx]
    y_tr, y_te = y[tr_idx], y[te_idx]
    ridge = Ridge(alpha=1.7)
    ridge.fit(X_tr, y_tr)
    train_errs.append(sqrt(metrics.mean_squared_error(y_tr, ridge.predict(X_tr))))
    test_errs.append(sqrt(metrics.mean_squared_error(y_te, ridge.predict(X_te))))
print(np.mean(test_errs))
print(np.mean(train_errs))
In [34]:
df_regr_q7 = pd.read_csv(root+"regressao_Q7.csv")
df_regr_q7.head()
Out[34]:
In [141]:
# Features and target for the Q7 regression task (rebinds X and y).
X = df_regr_q7.drop(['target'],axis=1).values
y = df_regr_q7['target'].values
In [146]:
# 10-fold CV of a decision-tree regressor on the Q7 data, tracking mean
# absolute error on the training folds and on the held-out fold.
#
# BUG FIX: the folds were generated with kf.split(data) — `data` is the Q4
# classification matrix from an earlier cell, not this cell's X — so fold
# indices came from (and were sized by) the wrong dataset. Split X instead.
kf = KFold(n_splits=10)
maes_train = []
maes_test = []
for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    tree = DecisionTreeRegressor()
    tree.fit(X_train, y_train)
    maes_train.append(metrics.mean_absolute_error(y_train, tree.predict(X_train)))
    maes_test.append(metrics.mean_absolute_error(y_test, tree.predict(X_test)))
# (train MAE, test MAE) — a large gap indicates overfitting.
np.mean(maes_train), np.mean(maes_test)
Out[146]:
In [149]:
# Two-sample Kolmogorov–Smirnov test: statistic is the maximum distance
# between the two empirical CDFs. The seed is kept for parity with the
# original run (the default two-sided test itself is deterministic).
np.random.seed(42)
sample_a = [5, 3, 3, 11, 8, 7, 1, 5, 4, 9]
sample_b = [2, 1, 1, 4, 10, 1, 1, 1, 3, 2]
sp.ks_2samp(sample_a, sample_b)
Out[149]:
In [151]:
# Probability the classifier is right, by the law of total probability over
# part quality: P(right) = P(right|bad)P(bad) + P(right|good)P(good).
p_bad = 0.15
p_right_given_bad = 0.9
p_right_given_good = 0.95
p_good = 1 - p_bad
p_right_given_bad * p_bad + p_right_given_good * p_good
Out[151]:
In [153]:
# Same total-probability computation with the numbers inlined
# (factors commuted; IEEE multiplication is commutative, so identical value).
0.15 * 0.9 + 0.85 * 0.95
Out[153]: