Version 2 main highlights:
Version 2 changes:
In [1]:
# Select matplotlib's "notebook" backend for the figures produced in this notebook.
%matplotlib notebook
# Wildcard-import everything sciblox2 exports. The helpers called in later cells
# (read, analyse, varcheck, outlier, plot, preprocess, ...) are not defined in this
# notebook, so they presumably all come from this import -- TODO confirm.
# NOTE(review): a star import hides where each name comes from; consider explicit imports.
from sciblox2 import *
In [2]:
# Read "train.csv" into x, then call analyse() on it with "Survived" passed as y.
# NOTE(review): presumably x is a tabular dataset and y names the target column -- confirm
# against the sciblox2 documentation.
x = read("train.csv")
analyse(x, y = "Survived")
Out[2]:
In [3]:
# Call varcheck() on x with all of its default arguments; the result is the cell output.
varcheck(x)
Out[3]:
In [4]:
# Call varcheck() on x with limit = False.
# NOTE(review): presumably this lifts a cap on how much is reported -- confirm in sciblox2.
varcheck(x, limit = False)
Out[4]:
In [40]:
# Call analyse() on x with "Survived" as y, this time with limit = False and graph = False.
# NOTE(review): presumably "no cap on output" and "skip plotting" -- confirm in sciblox2.
analyse(x, y = "Survived", limit = False, graph = False)
Out[40]:
In [9]:
# Call outlier() on x with indicate = True and keep the result in O.
# The following plot cell passes data = O with hue = "IsOutlier", so O presumably
# carries an "IsOutlier" column -- TODO confirm.
O = outlier(x, indicate = True)
In [10]:
# Plot O with "Age", "Fare" and "Pclass" passed as x, y and z, and "IsOutlier" passed as hue.
plot(x = "Age", y = "Fare", z = "Pclass", data = O, hue = "IsOutlier")
In [41]:
# Replace x with the result of notoutlier(x).
# NOTE(review): this overwrites its own input, so re-running the cell applies
# notoutlier() to the already-processed x. Its prompt number (41) is also higher than
# those of the preprocess/model cells further down (15, 30, 34), which suggests those
# cells were last executed before this one -- verify with Restart & Run All.
x = notoutlier(x)
In [15]:
# Call preprocess() on x with target = "Survived" and every other option left at its default.
# The call is unpacked into five names; going by those names they are train/test features,
# train/test targets and a processor object (processor is later passed to prefit() and
# predict()).
# NOTE(review): the next cell assigns the same five names again, so these values are
# replaced when the notebook is run top to bottom.
x_train, x_test, y_train, y_test, processor = preprocess(x, target = "Survived")
In [30]:
# Call preprocess() on x with explicit options instead of the defaults, one keyword
# per line so each setting can be changed independently.
# NOTE(review): the meaning of each option is defined by sciblox2 and is not visible
# here -- check its documentation before changing values.
x_train, x_test, y_train, y_test, processor = preprocess(
    x,
    target="Survived",
    hold=0.1,
    impute="mice",
    mice="boost",
    scale="robust",
    dummy=True,
    norm=False,
)
In [34]:
# Call randomforest() and lightgbm() on the same train/test features and targets and keep
# the results in modelrf and modellg.
# NOTE(review): presumably each call fits a model and returns it -- confirm; modellg is the
# one passed as model to predict() later in the notebook.
modelrf = randomforest(x_train, x_test, y_train, y_test)
modellg = lightgbm(x_train, x_test, y_train, y_test)
In [32]:
# Read "test.csv", pass it through prefit() together with the processor returned by
# preprocess(), and display sample(test) as the cell output.
# NOTE(review): presumably prefit() applies the transformations recorded in processor to
# the new data -- confirm in sciblox2.
test = read("test.csv")
test = prefit(test, processor)
sample(test)
Out[32]:
In [35]:
# Call predict() with the prefit test data, modellg as the model, and the same processor.
# NOTE(review): modelrf is not used here; the result is only displayed, not stored.
predict(test = test, model = modellg, processor = processor)
Out[35]:
In [42]:
# Plot from x with "Age" passed as x and "Fare" passed as y.
plot(x = "Age", y = "Fare", data = x)
In [43]:
# Plot from x with "Age" passed as x and "Embarked" passed as y.
plot(x = "Age", y = "Embarked", data = x)
In [45]:
# Plot from x with "Age" as x and "Embarked" as y, and "Fare" passed as hue.
plot(x = "Age", y = "Embarked", data = x, hue = "Fare")
In [46]:
# Call plot() with only x = "Age" (no y) on x.
plot(x = "Age", data = x)