sciblox v2 (Daniel Han-Chen)

Version 2 main highlights:

NEW: Machine Learning Modules (LightGBM + Random Forests supported as of now)
NEW: Feature importance analyser
NEW: Advanced data preprocessing and analysis

Version 2 changes:

Fixed errors when using hcat or vcat
Faster-converging BPCA imputation
Added machine learning modules
Added new data analysis methods - unskew, remove outliers
Better data analyser methods
Streamlined package
Automatic package installation

Use "%matplotlib notebook" for plot customisations, or "%matplotlib inline" for static plots.



In [1]:

    
%matplotlib notebook
from sciblox2 import *

Showcase:



In [2]:

    
# Read train.csv into x, then run sciblox's analyse with "Survived" as y
# ("Survived" is the same column later passed to preprocess as the target).
x = read("train.csv")
analyse(x, y = "Survived")



In [3]:

    
# Per-column frequency-ratio / %unique / variance check on x.
# With default arguments the saved output lists only the rows whose BAD? flag
# is True (Parch and Embarked) -- presumably the default limits the table to
# flagged columns; confirm against the sciblox2 docs.
varcheck(x)









    Out[3]:




  
 
     
         
        FreqRatio 
        BadFreq? 
        %Unique 
        BadUnq? 
        Var 
        BadVar? 
        BAD? 
     
     
        Parch 
        5.75 
        1 
        0.79 
        0 
        0.65 
        0 
        True 
    
     
        Embarked 
        3.83 
        1 
        0.34 
        0 
        nan 
        0 
        True



In [4]:

    
# Same check with limit = False: the saved output lists all 12 columns,
# whether or not they are flagged BAD?.
varcheck(x, limit = False)









    Out[4]:




  
 
     
         
        FreqRatio 
        BadFreq? 
        %Unique 
        BadUnq? 
        Var 
        BadVar? 
        BAD? 
     
     
        Parch 
        5.75 
        1 
        0.79 
        0 
        0.65 
        0 
        True 
    
     
        Embarked 
        3.83 
        1 
        0.34 
        0 
        nan 
        0 
        True 
    
     
        SibSp 
        2.91 
        0 
        0.79 
        0 
        1.22 
        0 
        False 
    
     
        Pclass 
        2.27 
        0 
        0.34 
        0 
        0.7 
        0 
        False 
    
     
        Sex 
        1.84 
        0 
        0.22 
        0 
        nan 
        0 
        False 
    
     
        Survived 
        1.61 
        0 
        0.22 
        0 
        0.24 
        0 
        False 
    
     
        Age 
        1.11 
        0 
        9.88 
        0 
        211.02 
        0 
        False 
    
     
        Fare 
        1.02 
        0 
        27.83 
        0 
        2469.44 
        0 
        False 
    
     
        PassengerId 
        1 
        0 
        100 
        0 
        66231 
        0 
        False 
    
     
        Name 
        1 
        0 
        100 
        0 
        nan 
        0 
        False 
    
     
        Ticket 
        1 
        0 
        76.43 
        0 
        nan 
        0 
        False 
    
     
        Cabin 
        1 
        0 
        16.5 
        0 
        nan 
        0 
        False



In [40]:

    
# Summary table of the feature columns against "Survived". The saved output
# has one row per feature (Importance, Mean, Median, Range, FreqRatio, Mode,
# %Unique, Skewness), ordered by descending Importance.
# graph = False presumably suppresses the plots and limit = False presumably
# shows every column -- confirm against the sciblox2 docs.
analyse(x, y = "Survived", limit = False, graph = False)









    Out[40]:




  
 
     
         
        Importance 
        Mean 
        Median 
        Range 
        FreqRatio 
        Mode 
        %Unique 
        Skewness 
     
     
        Age 
        27.86 
        29.7 
        28 
        79.58 
        1.11 
        24 
        9.88 
        0.39 
    
     
        Fare 
        25.71 
        32.2 
        14.45 
        512.33 
        1.02 
        8.05 
        27.83 
        4.79 
    
     
        Pclass 
        17.14 
        2.31 
        3 
        2 
        2.27 
        3 
        0.34 
        -0.63 
    
     
        Sex 
        15.71 
        nan 
        nan 
        nan 
        1.84 
        male 
        0.22 
        nan 
    
     
        SibSp 
        8.57 
        0.52 
        0 
        8 
        2.91 
        0 
        0.79 
        3.7 
    
     
        Embarked 
        5 
        nan 
        nan 
        nan 
        3.83 
        S 
        0.34 
        nan 
    
     
        Parch 
        0 
        0.38 
        0 
        6 
        5.75 
        0 
        0.79 
        2.75 
    
     
        Name 
        0 
        nan 
        nan 
        nan 
        1 
        Abbing, Mr. Anthony 
        100 
        nan 
    
     
        PassengerId 
        0 
        446 
        446 
        890 
        1 
        1 
        100 
        0 
    
     
        Ticket 
        0 
        nan 
        nan 
        nan 
        1 
        1601 
        76.43 
        nan 
    
     
        Cabin 
        0 
        nan 
        nan 
        nan 
        1 
        B96 B98 
        16.5 
        nan



In [9]:

    
# Outlier detection on x; the call prints "No. outliers = 23".
# O is plotted in the next cell with hue = "IsOutlier", so indicate = True
# presumably returns the data with an added IsOutlier column (confirm).
O = outlier(x, indicate = True)









    



No. outliers = 23



In [10]:

    
# Plot Age (x), Fare (y) and Pclass (z) from O, using the IsOutlier column as
# hue so flagged rows can be told apart.
plot(x = "Age", y = "Fare", z = "Pclass", data = O, hue = "IsOutlier")



In [41]:

    
# Rebind x to the result of notoutlier (the call prints "No. outliers = 23").
# NOTE(review): this overwrites the frame loaded from train.csv, so the cell
# is not idempotent. It was also executed as In [41], after the preprocess
# (In [15]/In [30]) and training (In [34]) cells shown below it -- the saved
# outputs of those cells may not match a fresh top-to-bottom run.
x = notoutlier(x)









    



No. outliers = 23



In [15]:

    
# Preprocess x with "Survived" as the target, using default settings.
# Returns four data splits plus a processor object; the processor is reused
# later by prefit and predict.
x_train, x_test, y_train, y_test, processor = preprocess(x, target = "Survived")









    



Please wait...
Done
--------------------------------
x_train, x_test, y_train, y_test
Processing finished :)



In [30]:

    
# Preprocess again with every option spelled out, one keyword per line.
# This rebinds the splits and the processor produced by the previous cell;
# the models trained below use the values assigned here.
x_train, x_test, y_train, y_test, processor = preprocess(
    x,
    target = "Survived",
    hold = 0.1,
    impute = "mice",
    mice = "boost",
    scale = "robust",
    dummy = True,
    norm = False
)









    



Please wait...
Done
--------------------------------
x_train, x_test, y_train, y_test
Processing finished :)



In [34]:

    
# Train a random forest and a LightGBM model on the same splits.
# Each call prints its own kappa and rmse scores; the random forest call also
# prints an OOB score.
modelrf = randomforest(x_train, x_test, y_train, y_test)
modellg = lightgbm(x_train, x_test, y_train, y_test)









    



--------------------------------
RandomForest: Training...
Finished
Score = kappa = 0.575549993413
Score = rmse = 0.448461055651
Score = OOB = 0.785112359551
--------------------------------
LightGBM: Training...
Finished
Score = kappa = 0.593153798471
Score = rmse = 0.429368771453



In [32]:

    
# Read test.csv, pass it through prefit together with the processor returned
# by preprocess, then display a few rows (the saved output shows 5 rows x 27
# columns, including dummy columns such as Pclass_1.0 and Sex_female).
test = read("test.csv")
test = prefit(test, processor)
sample(test)









    



Please wait...
Done






    Out[32]:







  
    
      
      Pclass
      Age
      SibSp
      Parch
      Fare
      Pclass_1.0
      Pclass_2.0
      Pclass_3.0
      Sex_female
      Sex_male
      ...
      Parch_0.0
      Parch_1.0
      Parch_2.0
      Parch_3.0
      Parch_4.0
      Parch_5.0
      Parch_6.0
      Embarked_C
      Embarked_Q
      Embarked_S
    
  
  
    
      204
      -1.0
      -0.205882
      0.0
      0.0
      -0.171255
      0.0
      1.0
      -1.0
      0.0
      0.0
      ...
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
    
    
      85
      0.0
      -0.759688
      1.0
      0.0
      0.000000
      0.0
      0.0
      0.0
      0.0
      0.0
      ...
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      1.0
      0.0
      -1.0
    
    
      394
      0.0
      0.029412
      3.0
      1.0
      0.327888
      0.0
      0.0
      0.0
      0.0
      0.0
      ...
      -1.0
      1.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
    
    
      395
      -2.0
      -0.617647
      1.0
      0.0
      1.972568
      1.0
      0.0
      -1.0
      1.0
      -1.0
      ...
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
    
    
      213
      -1.0
      1.852941
      1.0
      0.0
      0.500043
      0.0
      1.0
      -1.0
      1.0
      -1.0
      ...
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
    
  

5 rows × 27 columns



In [35]:

    
# Predict on the test set with the LightGBM model; the saved output is an
# int64 array of 0/1 labels.
# NOTE(review): test was already passed through prefit with this processor,
# and the processor is passed again here -- confirm the data is not
# transformed twice.
predict(test = test, model = modellg, processor = processor)









    Out[35]:





array([0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0,
       1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0,
       1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1,
       0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1,
       1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0,
       1, 0, 0, 0], dtype=int64)



In [42]:

    
# Age on x, Fare on y, drawn from x.
plot(x = "Age", y = "Fare", data = x)



In [43]:

    
# Age on x, Embarked on y. Embarked has no numeric mean in the summary table
# above, so it is presumably treated as a categorical axis (confirm).
plot(x = "Age", y = "Embarked", data = x)



In [45]:

    
# Same Age / Embarked axes, with the Fare column passed as hue.
plot(x = "Age", y = "Embarked", data = x, hue = "Fare")



In [46]:

    
# Only x is given: plot of the Age column on its own.
plot(x = "Age", data = x)

	Importance	Mean	Median	Range	FreqRatio	Mode	%Unique	Skewness
Age	27.86	29.7	28	79.58	1.11	24	9.88	0.39
Fare	25.71	32.2	14.45	512.33	1.02	8.05	27.83	4.79
Pclass	17.14	2.31	3	2	2.27	3	0.34	-0.63
Sex	15.71	nan	nan	nan	1.84	male	0.22	nan
SibSp	8.57	0.52	0	8	2.91	0	0.79	3.7
Embarked	5	nan	nan	nan	3.83	S	0.34	nan

	FreqRatio	BadFreq?	%Unique	Var	BAD?
Parch	5.75	1	0.79	0.65	True
Embarked	3.83	1	0.34	nan	True
SibSp	2.91	0	0.79	1.22	False
Pclass	2.27	0	0.34	0.7	False
Sex	1.84	0	0.22	nan	False
Survived	1.61	0	0.22	0.24	False
Age	1.11	0	9.88	211.02	False
Fare	1.02	0	27.83	2469.44	False
PassengerId	1	0	100	66231	False
Name	1	0	100	nan	False
Ticket	1	0	76.43	nan	False
Cabin	1	0	16.5	nan	False

	Pclass	Age	SibSp	Parch	Fare	Pclass_1.0	Pclass_2.0	Pclass_3.0	Sex_female	Sex_male	...	Parch_0.0	Parch_1.0	Embarked_C	Embarked_S
204	-1.0	-0.205882	0.0	0.0	-0.171255	0.0	1.0	-1.0	0.0	0.0	...	0.0	0.0	0.0	0.0
85	0.0	-0.759688	1.0	0.0	0.000000	0.0	0.0	0.0	0.0	0.0	...	0.0	0.0	1.0	-1.0
394	0.0	0.029412	3.0	1.0	0.327888	0.0	0.0	0.0	0.0	0.0	...	-1.0	1.0	0.0	0.0
395	-2.0	-0.617647	1.0	0.0	1.972568	1.0	0.0	-1.0	1.0	-1.0	...	0.0	0.0	0.0	0.0
213	-1.0	1.852941	1.0	0.0	0.500043	0.0	1.0	-1.0	1.0	-1.0	...	0.0	0.0	0.0	0.0