In [1]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from scipy.stats import norm
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from scipy import stats
import warnings
warnings.filterwarnings('ignore')
In [11]:
DATAFILE = os.path.join('data','data.csv')
TARGETFILE = os.path.join('data','target.csv')
OUTDIR = os.path.join('results')
In [45]:
#TO ADD: GTARGET AND TDI
train = pd.read_csv(DATAFILE)
target = pd.read_csv(TARGETFILE)
dataset = pd.concat([train,target],axis=1)
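Since pd.concat with axis=1 aligns rows on the index, a quick sanity check helps confirm the two files line up (a minimal sketch, not part of the original notebook):
In [ ]:
# Sanity check (sketch): both frames should have identical length and index,
# otherwise the column-wise concat above would introduce misaligned NaN rows.
assert len(train) == len(target), 'data.csv and target.csv differ in row count'
assert train.index.equals(target.index), 'row indexes differ between data and target'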
In [46]:
#Original dataset size
original_size = np.shape(dataset)[0]
#Drop rows with "extreme" target values (Y = -200 or Y = 200)
dataset = dataset[dataset.Y!=-200]
dataset = dataset[dataset.Y!=200]
#New dataset size and analysis of lost data
new_size = np.shape(dataset)[0]
print('Records lost: {:.2f}%'.format(100.0 * (1 - new_size / original_size)))
#Description of the dataset
dataset.describe()
Out[46]:
In [47]:
dataset['Y'].describe()
Out[47]:
In [49]:
sns.distplot(dataset['Y'])
plt.show()
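A couple of summary statistics complement the target distribution plot (a sketch, assuming the same 'Y' column):
In [ ]:
# Distribution diagnostics (sketch): skewness and kurtosis of the target Y.
print('Skewness: {:.4f}'.format(dataset['Y'].skew()))
print('Kurtosis: {:.4f}'.format(dataset['Y'].kurt()))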
In [52]:
#correlation matrix
corrmat = dataset.corr()
sns.heatmap(corrmat, vmax=.8, square=True)
plt.show()
In [51]:
#Target correlation matrix
k = 5 #number of variables for heatmap
cols = corrmat.nlargest(k, 'Y')['Y'].index
cm = np.corrcoef(dataset[cols].values.T)
sns.set(font_scale=1.25)
hm = sns.heatmap(cm, cbar=True, annot=True, square=True, fmt='.2f', annot_kws={'size': 10}, yticklabels=cols.values, xticklabels=cols.values)
plt.show()
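Note that nlargest on the signed correlations keeps only the features most positively correlated with Y; a variant based on absolute correlation (a sketch, not part of the original notebook) also captures strong negative relationships:
In [ ]:
# Alternative feature selection for the heatmap (sketch): rank by |corr(X, Y)|
# so that strongly negative correlations are not discarded.
cols_abs = corrmat['Y'].abs().nlargest(k).index
cm_abs = np.corrcoef(dataset[cols_abs].values.T)
sns.heatmap(cm_abs, cbar=True, annot=True, square=True, fmt='.2f',
            annot_kws={'size': 10}, yticklabels=cols_abs.values, xticklabels=cols_abs.values)
plt.show()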
In [21]:
#scatterplot
sns.set()
sns.pairplot(dataset, size = 2.5)
plt.show();
In [19]:
dataset_trans = dataset.copy()
In [20]:
#GT
var = 'GT'
sns.distplot(dataset[var])
plt.show()
res = stats.probplot(dataset[var], plot=plt)
plt.show()
#min-max scaling (no log-transformation for this variable)
scaler = MinMaxScaler().fit(dataset_trans[[var]])
dataset_trans[var] = scaler.transform(dataset_trans[[var]]).ravel()
sns.distplot(dataset_trans[var])
plt.show()
In [12]:
#min_risk
var = 'min_risk'
sns.distplot(dataset[var])
plt.show()
res = stats.probplot(dataset[var], plot=plt)
plt.show()
#log-transformation and standard scaling
dataset_trans[var] = np.log(dataset[var])
scaler = StandardScaler().fit(dataset_trans[[var]])
dataset_trans[var] = scaler.transform(dataset_trans[[var]]).ravel()
sns.distplot(dataset_trans[var])
plt.show()
res = stats.probplot(dataset_trans[var], plot=plt)
plt.show()
In [13]:
#CR
var = 'CR'
sns.distplot(dataset[var])
plt.show()
res = stats.probplot(dataset[var], plot=plt)
plt.show()
#log-transformation and standard scaling
dataset_trans[var] = np.log(dataset[var])
scaler = StandardScaler().fit(dataset_trans[[var]])
dataset_trans[var] = scaler.transform(dataset_trans[[var]]).ravel()
sns.distplot(dataset_trans[var])
plt.show()
res = stats.probplot(dataset_trans[var], plot=plt)
plt.show()
In [14]:
#CF
var = 'CF'
sns.distplot(dataset[var])
plt.show()
res = stats.probplot(dataset[var], plot=plt)
plt.show()
#log-transformation and standard scaling
dataset_trans[var] = np.log(dataset[var])
scaler = StandardScaler().fit(dataset_trans[[var]])
dataset_trans[var] = scaler.transform(dataset_trans[[var]]).ravel()
sns.distplot(dataset_trans[var])
plt.show()
res = stats.probplot(dataset_trans[var], plot=plt)
plt.show()
In [15]:
#u2ss
var = 'u2ss'
sns.distplot(dataset[var])
plt.show()
res = stats.probplot(dataset[var], plot=plt)
plt.show()
#log-transformation and standard scaling
dataset_trans[var] = np.log(dataset[var])
scaler = StandardScaler().fit(dataset_trans[[var]])
dataset_trans[var] = scaler.transform(dataset_trans[[var]]).ravel()
sns.distplot(dataset_trans[var])
plt.show()
res = stats.probplot(dataset_trans[var], plot=plt)
plt.show()
In [16]:
#Vmx
var = 'Vmx'
sns.distplot(dataset[var])
plt.show()
res = stats.probplot(dataset[var], plot=plt)
plt.show()
#log-transformation and standard scaling
dataset_trans[var] = np.log(dataset[var])
scaler = StandardScaler().fit(dataset_trans[[var]])
dataset_trans[var] = scaler.transform(dataset_trans[[var]]).ravel()
sns.distplot(dataset_trans[var])
plt.show()
res = stats.probplot(dataset_trans[var], plot=plt)
plt.show()
In [17]:
#kp3
var = 'kp3'
sns.distplot(dataset[var])
plt.show()
res = stats.probplot(dataset[var], plot=plt)
plt.show()
#log-transformation and standard scaling
dataset_trans[var] = np.log(dataset[var])
scaler = StandardScaler().fit(dataset_trans[[var]])
dataset_trans[var] = scaler.transform(dataset_trans[[var]]).ravel()
sns.distplot(dataset_trans[var])
plt.show()
res = stats.probplot(dataset_trans[var], plot=plt)
plt.show()
In [18]:
#Gt
var = 'Gt'
sns.distplot(dataset[var])
plt.show()
res = stats.probplot(dataset[var], plot=plt)
plt.show()
#log-transformation and standard scaling
dataset_trans[var] = np.log(dataset[var])
scaler = StandardScaler().fit(dataset_trans[[var]])
dataset_trans[var] = scaler.transform(dataset_trans[[var]]).ravel()
sns.distplot(dataset_trans[var])
plt.show()
res = stats.probplot(dataset_trans[var], plot=plt)
plt.show()
In [19]:
#Ib
var = 'Ib'
sns.distplot(dataset[var])
plt.show()
res = stats.probplot(dataset[var], plot=plt)
plt.show()
#log-transformation and standard scaling
dataset_trans[var] = np.log(dataset[var])
scaler = StandardScaler().fit(dataset_trans[[var]])
dataset_trans[var] = scaler.transform(dataset_trans[[var]]).ravel()
sns.distplot(dataset_trans[var])
plt.show()
res = stats.probplot(dataset_trans[var], plot=plt)
plt.show()
In [20]:
#BW
var = 'BW'
sns.distplot(dataset[var])
plt.show()
res = stats.probplot(dataset[var], plot=plt)
plt.show()
#log-transformation and standard scaling
dataset_trans[var] = np.log(dataset[var])
scaler = StandardScaler().fit(dataset_trans[[var]])
dataset_trans[var] = scaler.transform(dataset_trans[[var]]).ravel()
sns.distplot(dataset_trans[var])
plt.show()
res = stats.probplot(dataset_trans[var], plot=plt)
plt.show()
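The nine cells above repeat the same log-transform plus standard-scaling pattern; a small helper (a sketch, not part of the original notebook) could apply it to all of these columns in one pass:
In [ ]:
# Helper sketch: log-transform strictly positive columns taken from `source`
# and standardize them in `df`, mirroring the per-variable cells above.
def log_standardize(df, source, columns):
    for col in columns:
        df[col] = np.log(source[col])
        scaler = StandardScaler().fit(df[[col]])
        df[col] = scaler.transform(df[[col]]).ravel()
    return df

#Example usage (commented out to avoid transforming the columns twice):
#dataset_trans = log_standardize(dataset_trans, dataset,
#                                ['min_risk', 'CR', 'CF', 'u2ss', 'Vmx', 'kp3', 'Gt', 'Ib', 'BW'])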
In [21]:
#ROC: one-hot encode the categorical variable
var = 'ROC'
enc = LabelEncoder()
a = enc.fit_transform(np.asarray(dataset_trans[var]))
n = np.shape(enc.classes_)[0]
b = np.zeros([np.shape(a)[0], n])
for i in np.arange(np.shape(a)[0]):
    b[i, a[i]] = 1
columns = ["" for x in range(n)]
for i in np.arange(n):
    columns[i] = var + str(enc.classes_[i])
#keep the filtered row index so the column-wise concat below stays aligned
d = pd.DataFrame(data=b, columns=columns, index=dataset_trans.index)
del dataset_trans[var]
dataset_trans = pd.concat([d, dataset_trans], axis=1)
In [22]:
#var_class: one-hot encode the categorical variable
var = 'var_class'
enc = LabelEncoder()
a = enc.fit_transform(np.asarray(dataset_trans[var]))
n = np.shape(enc.classes_)[0]
b = np.zeros([np.shape(a)[0], n])
for i in np.arange(np.shape(a)[0]):
    b[i, a[i]] = 1
columns = ["" for x in range(n)]
for i in np.arange(n):
    columns[i] = var + str(enc.classes_[i])
#keep the filtered row index so the column-wise concat below stays aligned
d = pd.DataFrame(data=b, columns=columns, index=dataset_trans.index)
del dataset_trans[var]
dataset_trans = pd.concat([d, dataset_trans], axis=1)
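The two encoding cells above can also be written with pandas' built-in one-hot encoding; a minimal sketch (equivalent up to column order and dtype), applied to `dataset`, which still holds the raw columns at this point:
In [ ]:
# One-hot encoding sketch via pd.get_dummies, reproducing the 'ROC'/'var_class'
# column naming of the manual LabelEncoder loops (prefix + class value, no separator).
dummies = pd.get_dummies(dataset[['ROC', 'var_class']].astype(str),
                         prefix=['ROC', 'var_class'], prefix_sep='')
dummies.head()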
In [23]:
#scatterplot
sns.set()
sns.pairplot(dataset_trans, size = 2.5)
plt.show();
In [24]:
#correlation matrix
corrmat = dataset_trans.corr()
sns.heatmap(corrmat, vmax=.8, square=True)
plt.show()
In [26]:
#Target correlation matrix
k = 10 #number of variables for heatmap
cols = corrmat.nlargest(k, 'Y')['Y'].index
cm = np.corrcoef(dataset_trans[cols].values.T)
sns.set(font_scale=1.25)
hm = sns.heatmap(cm, cbar=True, annot=True, square=True, fmt='.2f', annot_kws={'size': 10}, yticklabels=cols.values, xticklabels=cols.values)
plt.show()
In [30]:
dataset_trans
Out[30]:
In [31]:
dataset_trans.to_csv('dataset_trans.csv',sep=',',index=False)
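A quick round-trip check (a sketch) confirms the export can be read back with the same shape:
In [ ]:
# Reload the exported file and compare shapes (sketch).
reloaded = pd.read_csv('dataset_trans.csv')
print(reloaded.shape, dataset_trans.shape)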
In [ ]: