In [1]:

    
%load_ext autoreload
%autoreload 2



In [2]:

    
import sys
sys.path.append("..")

Now you can get extra information for the profiler if you activate pass verbose= True to optimus



In [3]:

    
# Create optimus
from optimus import Optimus
op = Optimus(master="local[*]", app_name = "optimus" , checkpoint= True, verbose=True)









    



C:\Users\argenisleon\Anaconda3\lib\site-packages\socks.py:58: DeprecationWarning: Using or importing the ABCs from 'collections' instead of from 'collections.abc' is deprecated, and in 3.8 it will stop working
  from collections import Callable

    You are using PySparkling of version 2.4.10, but your PySpark is of
    version 2.3.1. Please make sure Spark and PySparkling versions are compatible. 






    



f657036e-90b5-4391-8d48-fb3f16f6b8ae






    



INFO:optimus:Operative System:Windows
INFO:optimus:Just check that Spark and all necessary environments vars are present...
INFO:optimus:-----
INFO:optimus:SPARK_HOME=C:\opt\spark\spark-2.3.1-bin-hadoop2.7
INFO:optimus:HADOOP_HOME=C:\opt\hadoop-2.7.7
INFO:optimus:PYSPARK_PYTHON=C:\Users\argenisleon\Anaconda3\python.exe
INFO:optimus:PYSPARK_DRIVER_PYTHON=jupyter
INFO:optimus:PYSPARK_SUBMIT_ARGS=--jars "file:///C:/Users/argenisleon/Documents/Optimus/optimus/jars/RedshiftJDBC42-1.2.16.1027.jar,file:///C:/Users/argenisleon/Documents/Optimus/optimus/jars/mysql-connector-java-8.0.16.jar,file:///C:/Users/argenisleon/Documents/Optimus/optimus/jars/ojdbc8.jar,file:///C:/Users/argenisleon/Documents/Optimus/optimus/jars/postgresql-42.2.5.jar" --driver-class-path "C:/Users/argenisleon/Documents/Optimus/optimus/jars/RedshiftJDBC42-1.2.16.1027.jar;C:/Users/argenisleon/Documents/Optimus/optimus/jars/mysql-connector-java-8.0.16.jar;C:/Users/argenisleon/Documents/Optimus/optimus/jars/ojdbc8.jar;C:/Users/argenisleon/Documents/Optimus/optimus/jars/postgresql-42.2.5.jar" --conf "spark.sql.catalogImplementation=hive" pyspark-shell
INFO:optimus:JAVA_HOME=C:\java
INFO:optimus:Pyarrow Installed
INFO:optimus:-----
INFO:optimus:Starting or getting SparkSession and SparkContext...
INFO:optimus:Spark Version:2.3.1
INFO:optimus:Setting checkpoint folder local. If you are in a cluster initialize Optimus with master='your_ip' as param
INFO:optimus:Deleting previous folder if exists...
INFO:optimus:Creating the checkpoint directory...
INFO:optimus:
                             ____        __  _                     
                            / __ \____  / /_(_)___ ___  __  _______
                           / / / / __ \/ __/ / __ `__ \/ / / / ___/
                          / /_/ / /_/ / /_/ / / / / / / /_/ (__  ) 
                          \____/ .___/\__/_/_/ /_/ /_/\__,_/____/  
                              /_/                                  
                              
INFO:optimus:Transform and Roll out...
INFO:optimus:Optimus successfully imported. Have fun :).
INFO:optimus:Config.ini not found



In [4]:

    
df = op.load.csv("data/Meteorite_Landings.csv").h_repartition()



In [5]:

    
df.table(10)









    









Viewing 10 of 45716 rows / 10 columns
32 partition(s)


    
    
        
        
            name
            1 (string)
            
                
                nullable
                
            
        
        
        
            id
            2 (int)
            
                
                nullable
                
            
        
        
        
            nametype
            3 (string)
            
                
                nullable
                
            
        
        
        
            recclass
            4 (string)
            
                
                nullable
                
            
        
        
        
            mass (g)
            5 (double)
            
                
                nullable
                
            
        
        
        
            fall
            6 (string)
            
                
                nullable
                
            
        
        
        
            year
            7 (string)
            
                
                nullable
                
            
        
        
        
            reclat
            8 (double)
            
                
                nullable
                
            
        
        
        
            reclong
            9 (double)
            
                
                nullable
                
            
        
        
        
            GeoLocation
            10 (string)
            
                
                nullable
                
            
        
        
    

    
    
    
    
        
        
            Acfer⋅232
            
        
        
        
            240
            
        
        
        
            Valid
            
        
        
        
            H5
            
        
        
        
            725.0
            
        
        
        
            Found
            
        
        
        
            01/01/1991⋅12:00:00⋅AM
            
        
        
        
            27.73944
            
        
        
        
            4.32833
            
        
        
        
            (27.739440,⋅4.328330)
            
        
        
    
    
    
        
        
            Elephant⋅Moraine⋅90232
            
        
        
        
            8641
            
        
        
        
            Valid
            
        
        
        
            L6
            
        
        
        
            16.9
            
        
        
        
            Found
            
        
        
        
            01/01/1990⋅12:00:00⋅AM
            
        
        
        
            -76.28795
            
        
        
        
            156.46841
            
        
        
        
            (-76.287950,⋅156.468410)
            
        
        
    
    
    
        
        
            Grove⋅Mountains⋅020090
            
        
        
        
            30681
            
        
        
        
            Valid
            
        
        
        
            Martian⋅(shergottite)
            
        
        
        
            7.5
            
        
        
        
            Found
            
        
        
        
            01/01/2003⋅12:00:00⋅AM
            
        
        
        
            -72.99944
            
        
        
        
            75.26111
            
        
        
        
            (-72.999440,⋅75.261110)
            
        
        
    
    
    
        
        
            Northwest⋅Africa⋅891
            
        
        
        
            31912
            
        
        
        
            Valid
            
        
        
        
            H4
            
        
        
        
            70.8
            
        
        
        
            Found
            
        
        
        
            01/01/2001⋅12:00:00⋅AM
            
        
        
        
            None
            
        
        
        
            None
            
        
        
        
            None
            
        
        
    
    
    
        
        
            Queen⋅Alexandra⋅Range⋅93098
            
        
        
        
            19187
            
        
        
        
            Valid
            
        
        
        
            H6
            
        
        
        
            1.2
            
        
        
        
            Found
            
        
        
        
            01/01/1993⋅12:00:00⋅AM
            
        
        
        
            -84.5757
            
        
        
        
            162.56524
            
        
        
        
            (-84.575700,⋅162.565240)
            
        
        
    
    
    
        
        
            Queen⋅Alexandra⋅Range⋅94691
            
        
        
        
            20322
            
        
        
        
            Valid
            
        
        
        
            H6
            
        
        
        
            9.6
            
        
        
        
            Found
            
        
        
        
            01/01/1994⋅12:00:00⋅AM
            
        
        
        
            -84.0
            
        
        
        
            168.0
            
        
        
        
            (-84.000000,⋅168.000000)
            
        
        
    
    
    
        
        
            Meteorite⋅Hills⋅00977
            
        
        
        
            16211
            
        
        
        
            Valid
            
        
        
        
            H5
            
        
        
        
            13.2
            
        
        
        
            Found
            
        
        
        
            01/01/2000⋅12:00:00⋅AM
            
        
        
        
            -79.68333
            
        
        
        
            159.75
            
        
        
        
            (-79.683330,⋅159.750000)
            
        
        
    
    
    
        
        
            Grove⋅Mountains⋅020114
            
        
        
        
            46531
            
        
        
        
            Valid
            
        
        
        
            L3
            
        
        
        
            1.0
            
        
        
        
            Found
            
        
        
        
            01/01/2003⋅12:00:00⋅AM
            
        
        
        
            -72.98194
            
        
        
        
            75.25167
            
        
        
        
            (-72.981940,⋅75.251670)
            
        
        
    
    
    
        
        
            Pecora⋅Escarpment⋅91483
            
        
        
        
            18774
            
        
        
        
            Valid
            
        
        
        
            H5
            
        
        
        
            5.5
            
        
        
        
            Found
            
        
        
        
            01/01/1991⋅12:00:00⋅AM
            
        
        
        
            -85.55819
            
        
        
        
            -68.31586
            
        
        
        
            (-85.558190,⋅-68.315860)
            
        
        
    
    
    
        
        
            Ramlat⋅as⋅Sahmah⋅390
            
        
        
        
            55656
            
        
        
        
            Valid
            
        
        
        
            H3.8-6
            
        
        
        
            0.69
            
        
        
        
            Found
            
        
        
        
            01/01/2010⋅12:00:00⋅AM
            
        
        
        
            20.0949
            
        
        
        
            55.69318
            
        
        
        
            (20.094900,⋅55.693180)
            
        
        
    
    
    



Viewing 10 of 45716 rows / 10 columns
32 partition(s)

Profiler dump mode (Faster). It just handle the column data type as present in the dataframe



In [6]:

    
op.profiler.run(df, "name", infer=False, approx_count= True)









    



{'name': 'string', 'id': 'int', 'nametype': 'string', 'recclass': 'string', 'mass (g)': 'double', 'fall': 'string', 'year': 'string', 'reclat': 'double', 'reclong': 'double', 'GeoLocation': 'string'}
Including 'nan' as Null in processing 'name'
Including 'nan' as Null in processing 'name'
Including 'nan' as Null in processing 'nametype'
Including 'nan' as Null in processing 'recclass'
Including 'nan' as Null in processing 'fall'
Including 'nan' as Null in processing 'year'
Including 'nan' as Null in processing 'GeoLocation'






    






    Overview


    
        Dataset info
        
            
            
                Number of columns
                10

            
            
                Number of rows
                45716

            
            
                Total Missing (%)
                0.49%

            
            
                Total size in memory
                64.0 MB

            
            
        
    
    
        Column types
        
            
            
                String
                1

            
            
                Numeric
                0

            
            
                Date
                0

            
            
                Bool
                0

            
            
                Array
                0

            
            
                Not available
                0

            
            
        
    




    

        

        
            
                name
                categorical
            
            
                
                
                    Unique
                     
                
                
                    Unique (%)
                     99.56
                
                
                    Missing
                    0.0
                
                
                    Missing (%)
                    0
                
                
            
            
                
                    Datatypes
                
            
            
                
                
                    
                        String
                    
                    
                        45716
                    
                
                
                    
                        Integer
                    
                    
                        0
                    
                
                
                    
                        Float
                    
                    
                        0
                    
                
                
                    
                        Bool
                    
                    
                        0
                    
                
                
                    
                        Date
                    
                    
                        0
                    
                
                
                    
                        Missing
                    
                    
                        0
                    
                
                
                    
                        Null
                    
                    
                        0
                    

                
                
            
            

        
        
            Frequency
            

                
                    Value
                    Count
                    Frequency (%)
                
                
                
                    Święcany
                    1
                    0.002%
                

                
                
                    Łowicz
                    1
                    0.002%
                

                
                
                    Österplana 064
                    1
                    0.002%
                

                
                
                    Österplana 063
                    1
                    0.002%
                

                
                
                    Österplana 062
                    1
                    0.002%
                

                
                
                    Österplana 061
                    1
                    0.002%
                

                
                
                    Österplana 060
                    1
                    0.002%
                

                
                
                    Österplana 059
                    1
                    0.002%
                

                
                
                    Österplana 058
                    1
                    0.002%
                

                
                
                    Österplana 057
                    1
                    0.002%
                

                
                
                    "Missing"
                    0
                    0.0%
                

            
        
        

        
    
    
        
        

            

                
                    
                
            
        
        
        
        
            
                
                    
                

            

        
        
        
        
        
        
        

    





Viewing 10 of 45716 rows / 10 columns
32 partition(s)


    
    
        
        
            name
            1 (string)
            
                
                nullable
                
            
        
        
        
            id
            2 (int)
            
                
                nullable
                
            
        
        
        
            nametype
            3 (string)
            
                
                nullable
                
            
        
        
        
            recclass
            4 (string)
            
                
                nullable
                
            
        
        
        
            mass (g)
            5 (double)
            
                
                nullable
                
            
        
        
        
            fall
            6 (string)
            
                
                nullable
                
            
        
        
        
            year
            7 (string)
            
                
                nullable
                
            
        
        
        
            reclat
            8 (double)
            
                
                nullable
                
            
        
        
        
            reclong
            9 (double)
            
                
                nullable
                
            
        
        
        
            GeoLocation
            10 (string)
            
                
                nullable
                
            
        
        
    

    
    
    
    
        
        
            Acfer⋅232
            
        
        
        
            240
            
        
        
        
            Valid
            
        
        
        
            H5
            
        
        
        
            725.0
            
        
        
        
            Found
            
        
        
        
            01/01/1991⋅12:00:00⋅AM
            
        
        
        
            27.73944
            
        
        
        
            4.32833
            
        
        
        
            (27.739440,⋅4.328330)
            
        
        
    
    
    
        
        
            Elephant⋅Moraine⋅90232
            
        
        
        
            8641
            
        
        
        
            Valid
            
        
        
        
            L6
            
        
        
        
            16.9
            
        
        
        
            Found
            
        
        
        
            01/01/1990⋅12:00:00⋅AM
            
        
        
        
            -76.28795
            
        
        
        
            156.46841
            
        
        
        
            (-76.287950,⋅156.468410)
            
        
        
    
    
    
        
        
            Grove⋅Mountains⋅020090
            
        
        
        
            30681
            
        
        
        
            Valid
            
        
        
        
            Martian⋅(shergottite)
            
        
        
        
            7.5
            
        
        
        
            Found
            
        
        
        
            01/01/2003⋅12:00:00⋅AM
            
        
        
        
            -72.99944
            
        
        
        
            75.26111
            
        
        
        
            (-72.999440,⋅75.261110)
            
        
        
    
    
    
        
        
            Northwest⋅Africa⋅891
            
        
        
        
            31912
            
        
        
        
            Valid
            
        
        
        
            H4
            
        
        
        
            70.8
            
        
        
        
            Found
            
        
        
        
            01/01/2001⋅12:00:00⋅AM
            
        
        
        
            None
            
        
        
        
            None
            
        
        
        
            None
            
        
        
    
    
    
        
        
            Queen⋅Alexandra⋅Range⋅93098
            
        
        
        
            19187
            
        
        
        
            Valid
            
        
        
        
            H6
            
        
        
        
            1.2
            
        
        
        
            Found
            
        
        
        
            01/01/1993⋅12:00:00⋅AM
            
        
        
        
            -84.5757
            
        
        
        
            162.56524
            
        
        
        
            (-84.575700,⋅162.565240)
            
        
        
    
    
    
        
        
            Queen⋅Alexandra⋅Range⋅94691
            
        
        
        
            20322
            
        
        
        
            Valid
            
        
        
        
            H6
            
        
        
        
            9.6
            
        
        
        
            Found
            
        
        
        
            01/01/1994⋅12:00:00⋅AM
            
        
        
        
            -84.0
            
        
        
        
            168.0
            
        
        
        
            (-84.000000,⋅168.000000)
            
        
        
    
    
    
        
        
            Meteorite⋅Hills⋅00977
            
        
        
        
            16211
            
        
        
        
            Valid
            
        
        
        
            H5
            
        
        
        
            13.2
            
        
        
        
            Found
            
        
        
        
            01/01/2000⋅12:00:00⋅AM
            
        
        
        
            -79.68333
            
        
        
        
            159.75
            
        
        
        
            (-79.683330,⋅159.750000)
            
        
        
    
    
    
        
        
            Grove⋅Mountains⋅020114
            
        
        
        
            46531
            
        
        
        
            Valid
            
        
        
        
            L3
            
        
        
        
            1.0
            
        
        
        
            Found
            
        
        
        
            01/01/2003⋅12:00:00⋅AM
            
        
        
        
            -72.98194
            
        
        
        
            75.25167
            
        
        
        
            (-72.981940,⋅75.251670)
            
        
        
    
    
    
        
        
            Pecora⋅Escarpment⋅91483
            
        
        
        
            18774
            
        
        
        
            Valid
            
        
        
        
            H5
            
        
        
        
            5.5
            
        
        
        
            Found
            
        
        
        
            01/01/1991⋅12:00:00⋅AM
            
        
        
        
            -85.55819
            
        
        
        
            -68.31586
            
        
        
        
            (-85.558190,⋅-68.315860)
            
        
        
    
    
    
        
        
            Ramlat⋅as⋅Sahmah⋅390
            
        
        
        
            55656
            
        
        
        
            Valid
            
        
        
        
            H3.8-6
            
        
        
        
            0.69
            
        
        
        
            Found
            
        
        
        
            01/01/2010⋅12:00:00⋅AM
            
        
        
        
            20.0949
            
        
        
        
            55.69318
            
        
        
        
            (20.094900,⋅55.693180)
            
        
        
    
    
    



Viewing 10 of 45716 rows / 10 columns
32 partition(s)

Profiler smart mode (Slower). It just try to infer the column data type and present extra data accordingly. From example datetype columns get extra histograms about minutes, day, week and month. Also can detect array types on data.



In [6]:

    
op.profiler.run(df, "GeoLocation",infer=True)









    



INFO:optimus:Processing column 'GeoLocation'...
INFO:optimus:_count_data_types() executed in 36.83 sec
INFO:optimus:count_data_types() executed in 36.84 sec
INFO:optimus:cast_columns() executed in 0.0 sec
INFO:optimus:agg_exprs() executed in 4.67 sec
INFO:optimus:general_stats() executed in 4.68 sec
INFO:optimus:------------------------------
INFO:optimus:Processing column 'GeoLocation'...
INFO:optimus:frequency() executed in 6.22 sec
INFO:optimus:stats_by_column() executed in 0.0 sec
INFO:optimus:Using 'column_exp' to process column 'GeoLocation_len' with function func_col_exp
INFO:optimus:Using 'column_exp' to process column 'GeoLocation_len' with function _bucketizer
INFO:optimus:hist() executed in 4.79 sec
INFO:optimus:hist_string() executed in 8.07 sec






    



Including 'nan' as Null in processing 'name'
Including 'nan' as Null in processing 'nametype'
Including 'nan' as Null in processing 'recclass'
Including 'nan' as Null in processing 'fall'
Including 'nan' as Null in processing 'year'
Including 'nan' as Null in processing 'GeoLocation'






    



INFO:optimus:dataset_info() executed in 3.87 sec






    






    Overview


    
        Dataset info
        
            
            
                Number of columns
                10

            
            
                Number of rows
                45716

            
            
                Total Missing (%)
                0.49%

            
            
                Total size in memory
                80.0 MB

            
            
        
    
    
        Column types
        
            
            
                String
                0

            
            
                Numeric
                0

            
            
                Date
                0

            
            
                Bool
                0

            
            
                Array
                1

            
            
                Not available
                0

            
            
        
    




    

        

        
            
                GeoLocation
                array
            
            
                
                
                    Unique
                     
                
                
                    Unique (%)
                     36.499
                
                
                    Missing
                    0.0
                
                
                    Missing (%)
                    0
                
                
            
            
                
                    Datatypes
                
            
            
                
                
                    
                        String
                    
                    
                        0
                    
                
                
                    
                        Integer
                    
                    
                        0
                    
                
                
                    
                        Float
                    
                    
                        0
                    
                
                
                    
                        Bool
                    
                    
                        0
                    
                
                
                    
                        Date
                    
                    
                        0
                    
                
                
                    
                        Missing
                    
                    
                        0
                    
                
                
                    
                        Null
                    
                    
                        7315
                    

                
                
            
            

        
        
            Frequency
            

                
                    Value
                    Count
                    Frequency (%)
                
                
                
                    None
                    7315
                    16.001%
                

                
                
                    (0.000000, 0.000000)
                    6214
                    13.593%
                

                
                
                    (-71.500000, 35.666670)
                    4761
                    10.414%
                

                
                
                    (-84.000000, 168.000000)
                    3040
                    6.65%
                

                
                
                    (-72.000000, 26.000000)
                    1505
                    3.292%
                

                
                
                    (-79.683330, 159.750000)
                    657
                    1.437%
                

                
                
                    (-76.716670, 159.666670)
                    637
                    1.393%
                

                
                
                    (-76.183330, 157.166670)
                    539
                    1.179%
                

                
                
                    (-79.683330, 155.750000)
                    473
                    1.035%
                

                
                
                    (-84.216670, 160.500000)
                    263
                    0.575%
                

                
                
                    "Missing"
                    0
                    0.0%
                

            
        
        

        
    
    
        
        

            

                
                    
                
            
        
        
        
        
            
                
                    
                

            

        
        
        
        
        
        
        

    





Viewing 10 of 45716 rows / 10 columns
32 partition(s)


    
    
        
        
            name
            1 (string)
            
                
                nullable
                
            
        
        
        
            id
            2 (int)
            
                
                nullable
                
            
        
        
        
            nametype
            3 (string)
            
                
                nullable
                
            
        
        
        
            recclass
            4 (string)
            
                
                nullable
                
            
        
        
        
            mass (g)
            5 (double)
            
                
                nullable
                
            
        
        
        
            fall
            6 (string)
            
                
                nullable
                
            
        
        
        
            year
            7 (string)
            
                
                nullable
                
            
        
        
        
            reclat
            8 (double)
            
                
                nullable
                
            
        
        
        
            reclong
            9 (double)
            
                
                nullable
                
            
        
        
        
            GeoLocation
            10 (string)
            
                
                nullable
                
            
        
        
    

    
    
    
    
        
        
            Acfer⋅232
            
        
        
        
            240
            
        
        
        
            Valid
            
        
        
        
            H5
            
        
        
        
            725.0
            
        
        
        
            Found
            
        
        
        
            01/01/1991⋅12:00:00⋅AM
            
        
        
        
            27.73944
            
        
        
        
            4.32833
            
        
        
        
            (27.739440,⋅4.328330)
            
        
        
    
    
    
        
        
            Elephant⋅Moraine⋅90232
            
        
        
        
            8641
            
        
        
        
            Valid
            
        
        
        
            L6
            
        
        
        
            16.9
            
        
        
        
            Found
            
        
        
        
            01/01/1990⋅12:00:00⋅AM
            
        
        
        
            -76.28795
            
        
        
        
            156.46841
            
        
        
        
            (-76.287950,⋅156.468410)
            
        
        
    
    
    
        
        
            Grove⋅Mountains⋅020090
            
        
        
        
            30681
            
        
        
        
            Valid
            
        
        
        
            Martian⋅(shergottite)
            
        
        
        
            7.5
            
        
        
        
            Found
            
        
        
        
            01/01/2003⋅12:00:00⋅AM
            
        
        
        
            -72.99944
            
        
        
        
            75.26111
            
        
        
        
            (-72.999440,⋅75.261110)
            
        
        
    
    
    
        
        
            Northwest⋅Africa⋅891
            
        
        
        
            31912
            
        
        
        
            Valid
            
        
        
        
            H4
            
        
        
        
            70.8
            
        
        
        
            Found
            
        
        
        
            01/01/2001⋅12:00:00⋅AM
            
        
        
        
            None
            
        
        
        
            None
            
        
        
        
            None
            
        
        
    
    
    
        
        
            Queen⋅Alexandra⋅Range⋅93098
            
        
        
        
            19187
            
        
        
        
            Valid
            
        
        
        
            H6
            
        
        
        
            1.2
            
        
        
        
            Found
            
        
        
        
            01/01/1993⋅12:00:00⋅AM
            
        
        
        
            -84.5757
            
        
        
        
            162.56524
            
        
        
        
            (-84.575700,⋅162.565240)
            
        
        
    
    
    
        
        
            Queen⋅Alexandra⋅Range⋅94691
            
        
        
        
            20322
            
        
        
        
            Valid
            
        
        
        
            H6
            
        
        
        
            9.6
            
        
        
        
            Found
            
        
        
        
            01/01/1994⋅12:00:00⋅AM
            
        
        
        
            -84.0
            
        
        
        
            168.0
            
        
        
        
            (-84.000000,⋅168.000000)
            
        
        
    
    
    
        
        
            Meteorite⋅Hills⋅00977
            
        
        
        
            16211
            
        
        
        
            Valid
            
        
        
        
            H5
            
        
        
        
            13.2
            
        
        
        
            Found
            
        
        
        
            01/01/2000⋅12:00:00⋅AM
            
        
        
        
            -79.68333
            
        
        
        
            159.75
            
        
        
        
            (-79.683330,⋅159.750000)
            
        
        
    
    
    
        
        
            Grove⋅Mountains⋅020114
            
        
        
        
            46531
            
        
        
        
            Valid
            
        
        
        
            L3
            
        
        
        
            1.0
            
        
        
        
            Found
            
        
        
        
            01/01/2003⋅12:00:00⋅AM
            
        
        
        
            -72.98194
            
        
        
        
            75.25167
            
        
        
        
            (-72.981940,⋅75.251670)
            
        
        
    
    
    
        
        
            Pecora⋅Escarpment⋅91483
            
        
        
        
            18774
            
        
        
        
            Valid
            
        
        
        
            H5
            
        
        
        
            5.5
            
        
        
        
            Found
            
        
        
        
            01/01/1991⋅12:00:00⋅AM
            
        
        
        
            -85.55819
            
        
        
        
            -68.31586
            
        
        
        
            (-85.558190,⋅-68.315860)
            
        
        
    
    
    
        
        
            Ramlat⋅as⋅Sahmah⋅390
            
        
        
        
            55656
            
        
        
        
            Valid
            
        
        
        
            H3.8-6
            
        
        
        
            0.69
            
        
        
        
            Found
            
        
        
        
            01/01/2010⋅12:00:00⋅AM
            
        
        
        
            20.0949
            
        
        
        
            55.69318
            
        
        
        
            (20.094900,⋅55.693180)
            
        
        
    
    
    



Viewing 10 of 45716 rows / 10 columns
32 partition(s)







    



INFO:optimus:run() executed in 65.79 sec

Plot profile for a specific column



In [8]:

    
op.profiler.run(df, "reclat")









    



Including 'nan' as Null in processing 'name'
Including 'nan' as Null in processing 'nametype'
Including 'nan' as Null in processing 'recclass'
Including 'nan' as Null in processing 'fall'
Including 'nan' as Null in processing 'year'
Including 'nan' as Null in processing 'GeoLocation'






    






    Overview


    
        Dataset info
        
            
            
                Number of columns
                10

            
            
                Number of rows
                45716

            
            
                Total Missing (%)
                0.49%

            
            
                Total size in memory
                100.4 MB

            
            
        
    
    
        Column types
        
            
            
                String
                0

            
            
                Numeric
                1

            
            
                Date
                0

            
            
                Bool
                0

            
            
                Array
                0

            
            
                Not available
                0

            
            
        
    




    

        

        
            
                reclat
                numeric
            
            
                
                
                    Unique
                     
                
                
                    Unique (%)
                     28.806
                
                
                    Missing
                    0.0
                
                
                    Missing (%)
                    0
                
                
            
            
                
                    Datatypes
                
            
            
                
                
                    
                        String
                    
                    
                        0
                    
                
                
                    
                        Integer
                    
                    
                        0
                    
                
                
                    
                        Float
                    
                    
                        0
                    
                
                
                    
                        Bool
                    
                    
                        0
                    
                
                
                    
                        Date
                    
                    
                        0
                    
                
                
                    
                        Missing
                    
                    
                        0
                    
                
                
                    
                        Null
                    
                    
                        7315
                    

                
                
            
            
            
                
                    Basic Stats
                

            
            
                
                
                    Mean
                    -39.12258
                
                
                    Minimum
                    -87.36667
                
                
                    Maximum
                    81.16667
                
                
                    Zeros(%)
                    
                

                
            
            

        
        
            Frequency
            

                
                    Value
                    Count
                    Frequency (%)
                
                
                
                    None
                    7315
                    16.001%
                

                
                
                    0.0
                    6438
                    14.083%
                

                
                
                    -71.5
                    4761
                    10.414%
                

                
                
                    -84.0
                    3040
                    6.65%
                

                
                
                    -72.0
                    1506
                    3.294%
                

                
                
                    -79.68333
                    1130
                    2.472%
                

                
                
                    -76.71667
                    680
                    1.487%
                

                
                
                    -76.18333
                    539
                    1.179%
                

                
                
                    -84.21667
                    263
                    0.575%
                

                
                
                    -86.36667
                    226
                    0.494%
                

                
                
                    "Missing"
                    0
                    0.0%
                

            
        
        

        
        


            Quantile statistics
            
                
                
                    Minimum
                    -87.36667
                
                
                    5-th percentile
                    
                
                
                    Q1
                    
                
                
                    Median
                    
                
                
                    Q3
                    
                
                
                    95-th percentile
                    
                
                
                    Maximum
                    81.16667
                
                
                    Range
                    168.53334
                
                
                    Interquartile range
                    0.0
                
                
            
        
        
            Descriptive statistics
            
                
                
                    Standard deviation
                    46.37851
                
                
                    Coef of variation
                    -1.18547
                
                
                    Kurtosis
                    -1.4768
                
                
                    Mean
                    -39.12258
                
                
                    MAD
                    0.0
                
                
                    Skewness
                    
                
                
                    Sum
                    -1502346.20654
                
                
                    Variance
                    2150.96632
                
                
            
        
        
    
    
        
        

            

                
                    
                
            
        
        
        
        
            
                
                    
                

            

        
        
        
        
        
        
        

    





Viewing 10 of 45716 rows / 10 columns
32 partition(s)


    
    
        
        
            name
            1 (string)
            
                
                nullable
                
            
        
        
        
            id
            2 (int)
            
                
                nullable
                
            
        
        
        
            nametype
            3 (string)
            
                
                nullable
                
            
        
        
        
            recclass
            4 (string)
            
                
                nullable
                
            
        
        
        
            mass (g)
            5 (double)
            
                
                nullable
                
            
        
        
        
            fall
            6 (string)
            
                
                nullable
                
            
        
        
        
            year
            7 (string)
            
                
                nullable
                
            
        
        
        
            reclat
            8 (double)
            
                
                nullable
                
            
        
        
        
            reclong
            9 (double)
            
                
                nullable
                
            
        
        
        
            GeoLocation
            10 (string)
            
                
                nullable
                
            
        
        
    

    
    
    
    
        
        
            Acfer⋅232
            
        
        
        
            240
            
        
        
        
            Valid
            
        
        
        
            H5
            
        
        
        
            725.0
            
        
        
        
            Found
            
        
        
        
            01/01/1991⋅12:00:00⋅AM
            
        
        
        
            27.73944
            
        
        
        
            4.32833
            
        
        
        
            (27.739440,⋅4.328330)
            
        
        
    
    
    
        
        
            Elephant⋅Moraine⋅90232
            
        
        
        
            8641
            
        
        
        
            Valid
            
        
        
        
            L6
            
        
        
        
            16.9
            
        
        
        
            Found
            
        
        
        
            01/01/1990⋅12:00:00⋅AM
            
        
        
        
            -76.28795
            
        
        
        
            156.46841
            
        
        
        
            (-76.287950,⋅156.468410)
            
        
        
    
    
    
        
        
            Grove⋅Mountains⋅020090
            
        
        
        
            30681
            
        
        
        
            Valid
            
        
        
        
            Martian⋅(shergottite)
            
        
        
        
            7.5
            
        
        
        
            Found
            
        
        
        
            01/01/2003⋅12:00:00⋅AM
            
        
        
        
            -72.99944
            
        
        
        
            75.26111
            
        
        
        
            (-72.999440,⋅75.261110)
            
        
        
    
    
    
        
        
            Northwest⋅Africa⋅891
            
        
        
        
            31912
            
        
        
        
            Valid
            
        
        
        
            H4
            
        
        
        
            70.8
            
        
        
        
            Found
            
        
        
        
            01/01/2001⋅12:00:00⋅AM
            
        
        
        
            None
            
        
        
        
            None
            
        
        
        
            None
            
        
        
    
    
    
        
        
            Queen⋅Alexandra⋅Range⋅93098
            
        
        
        
            19187
            
        
        
        
            Valid
            
        
        
        
            H6
            
        
        
        
            1.2
            
        
        
        
            Found
            
        
        
        
            01/01/1993⋅12:00:00⋅AM
            
        
        
        
            -84.5757
            
        
        
        
            162.56524
            
        
        
        
            (-84.575700,⋅162.565240)
            
        
        
    
    
    
        
        
            Queen⋅Alexandra⋅Range⋅94691
            
        
        
        
            20322
            
        
        
        
            Valid
            
        
        
        
            H6
            
        
        
        
            9.6
            
        
        
        
            Found
            
        
        
        
            01/01/1994⋅12:00:00⋅AM
            
        
        
        
            -84.0
            
        
        
        
            168.0
            
        
        
        
            (-84.000000,⋅168.000000)
            
        
        
    
    
    
        
        
            Meteorite⋅Hills⋅00977
            
        
        
        
            16211
            
        
        
        
            Valid
            
        
        
        
            H5
            
        
        
        
            13.2
            
        
        
        
            Found
            
        
        
        
            01/01/2000⋅12:00:00⋅AM
            
        
        
        
            -79.68333
            
        
        
        
            159.75
            
        
        
        
            (-79.683330,⋅159.750000)
            
        
        
    
    
    
        
        
            Grove⋅Mountains⋅020114
            
        
        
        
            46531
            
        
        
        
            Valid
            
        
        
        
            L3
            
        
        
        
            1.0
            
        
        
        
            Found
            
        
        
        
            01/01/2003⋅12:00:00⋅AM
            
        
        
        
            -72.98194
            
        
        
        
            75.25167
            
        
        
        
            (-72.981940,⋅75.251670)
            
        
        
    
    
    
        
        
            Pecora⋅Escarpment⋅91483
            
        
        
        
            18774
            
        
        
        
            Valid
            
        
        
        
            H5
            
        
        
        
            5.5
            
        
        
        
            Found
            
        
        
        
            01/01/1991⋅12:00:00⋅AM
            
        
        
        
            -85.55819
            
        
        
        
            -68.31586
            
        
        
        
            (-85.558190,⋅-68.315860)
            
        
        
    
    
    
        
        
            Ramlat⋅as⋅Sahmah⋅390
            
        
        
        
            55656
            
        
        
        
            Valid
            
        
        
        
            H3.8-6
            
        
        
        
            0.69
            
        
        
        
            Found
            
        
        
        
            01/01/2010⋅12:00:00⋅AM
            
        
        
        
            20.0949
            
        
        
        
            55.69318
            
        
        
        
            (20.094900,⋅55.693180)
            
        
        
    
    
    



Viewing 10 of 45716 rows / 10 columns
32 partition(s)

Output a json file

Plot histagram for multiple columns



In [9]:

    
df.plot.hist(["id", "reclong"], 20)



In [10]:

    
df.plot.frequency(["id", "reclong"], 10)



In [11]:

    
df.table()









    









Viewing 10 of 45716 rows / 10 columns
32 partition(s)


    
    
        
        
            name
            1 (string)
            
                
                nullable
                
            
        
        
        
            id
            2 (int)
            
                
                nullable
                
            
        
        
        
            nametype
            3 (string)
            
                
                nullable
                
            
        
        
        
            recclass
            4 (string)
            
                
                nullable
                
            
        
        
        
            mass (g)
            5 (double)
            
                
                nullable
                
            
        
        
        
            fall
            6 (string)
            
                
                nullable
                
            
        
        
        
            year
            7 (string)
            
                
                nullable
                
            
        
        
        
            reclat
            8 (double)
            
                
                nullable
                
            
        
        
        
            reclong
            9 (double)
            
                
                nullable
                
            
        
        
        
            GeoLocation
            10 (string)
            
                
                nullable
                
            
        
        
    

    
    
    
    
        
        
            Acfer⋅232
            
        
        
        
            240
            
        
        
        
            Valid
            
        
        
        
            H5
            
        
        
        
            725.0
            
        
        
        
            Found
            
        
        
        
            01/01/1991⋅12:00:00⋅AM
            
        
        
        
            27.73944
            
        
        
        
            4.32833
            
        
        
        
            (27.739440,⋅4.328330)
            
        
        
    
    
    
        
        
            Elephant⋅Moraine⋅90232
            
        
        
        
            8641
            
        
        
        
            Valid
            
        
        
        
            L6
            
        
        
        
            16.9
            
        
        
        
            Found
            
        
        
        
            01/01/1990⋅12:00:00⋅AM
            
        
        
        
            -76.28795
            
        
        
        
            156.46841
            
        
        
        
            (-76.287950,⋅156.468410)
            
        
        
    
    
    
        
        
            Grove⋅Mountains⋅020090
            
        
        
        
            30681
            
        
        
        
            Valid
            
        
        
        
            Martian⋅(shergottite)
            
        
        
        
            7.5
            
        
        
        
            Found
            
        
        
        
            01/01/2003⋅12:00:00⋅AM
            
        
        
        
            -72.99944
            
        
        
        
            75.26111
            
        
        
        
            (-72.999440,⋅75.261110)
            
        
        
    
    
    
        
        
            Northwest⋅Africa⋅891
            
        
        
        
            31912
            
        
        
        
            Valid
            
        
        
        
            H4
            
        
        
        
            70.8
            
        
        
        
            Found
            
        
        
        
            01/01/2001⋅12:00:00⋅AM
            
        
        
        
            None
            
        
        
        
            None
            
        
        
        
            None
            
        
        
    
    
    
        
        
            Queen⋅Alexandra⋅Range⋅93098
            
        
        
        
            19187
            
        
        
        
            Valid
            
        
        
        
            H6
            
        
        
        
            1.2
            
        
        
        
            Found
            
        
        
        
            01/01/1993⋅12:00:00⋅AM
            
        
        
        
            -84.5757
            
        
        
        
            162.56524
            
        
        
        
            (-84.575700,⋅162.565240)
            
        
        
    
    
    
        
        
            Queen⋅Alexandra⋅Range⋅94691
            
        
        
        
            20322
            
        
        
        
            Valid
            
        
        
        
            H6
            
        
        
        
            9.6
            
        
        
        
            Found
            
        
        
        
            01/01/1994⋅12:00:00⋅AM
            
        
        
        
            -84.0
            
        
        
        
            168.0
            
        
        
        
            (-84.000000,⋅168.000000)
            
        
        
    
    
    
        
        
            Meteorite⋅Hills⋅00977
            
        
        
        
            16211
            
        
        
        
            Valid
            
        
        
        
            H5
            
        
        
        
            13.2
            
        
        
        
            Found
            
        
        
        
            01/01/2000⋅12:00:00⋅AM
            
        
        
        
            -79.68333
            
        
        
        
            159.75
            
        
        
        
            (-79.683330,⋅159.750000)
            
        
        
    
    
    
        
        
            Grove⋅Mountains⋅020114
            
        
        
        
            46531
            
        
        
        
            Valid
            
        
        
        
            L3
            
        
        
        
            1.0
            
        
        
        
            Found
            
        
        
        
            01/01/2003⋅12:00:00⋅AM
            
        
        
        
            -72.98194
            
        
        
        
            75.25167
            
        
        
        
            (-72.981940,⋅75.251670)
            
        
        
    
    
    
        
        
            Pecora⋅Escarpment⋅91483
            
        
        
        
            18774
            
        
        
        
            Valid
            
        
        
        
            H5
            
        
        
        
            5.5
            
        
        
        
            Found
            
        
        
        
            01/01/1991⋅12:00:00⋅AM
            
        
        
        
            -85.55819
            
        
        
        
            -68.31586
            
        
        
        
            (-85.558190,⋅-68.315860)
            
        
        
    
    
    
        
        
            Ramlat⋅as⋅Sahmah⋅390
            
        
        
        
            55656
            
        
        
        
            Valid
            
        
        
        
            H3.8-6
            
        
        
        
            0.69
            
        
        
        
            Found
            
        
        
        
            01/01/2010⋅12:00:00⋅AM
            
        
        
        
            20.0949
            
        
        
        
            55.69318
            
        
        
        
            (20.094900,⋅55.693180)
            
        
        
    
    
    



Viewing 10 of 45716 rows / 10 columns
32 partition(s)



In [12]:

    
df.cols.count_na("*")









    



Including 'nan' as Null in processing 'name'
Including 'nan' as Null in processing 'nametype'
Including 'nan' as Null in processing 'recclass'
Including 'nan' as Null in processing 'fall'
Including 'nan' as Null in processing 'year'
Including 'nan' as Null in processing 'GeoLocation'






    Out[12]:





{'name': 0,
 'id': 0,
 'nametype': 0,
 'recclass': 0,
 'mass (g)': 131,
 'fall': 0,
 'year': 288,
 'reclat': 7315,
 'reclong': 7315,
 'GeoLocation': 7315}



In [13]:

    
a = {'name': 0,
 'id': 0,
 'nametype': 0,
 'recclass': 0,
 'mass (g)': 131,
 'fall': 0,
 'year': 288,
 'reclat': 7315,
 'reclong': 7315,
 'GeoLocation': 7315}

    df.cols.dtypes()









    



  File "<ipython-input-13-1cf015ba60d7>", line 12
    df.cols.dtypes()
    ^
IndentationError: unexpected indent



In [ ]:

    
cols = ["id","mass (g)","reclat"]
# We drops nulls because correlation can not handle them
df_not_nulls = df.rows.drop_na(cols)

df_not_nulls.plot.correlation(cols)



In [45]:

    
df_not_nulls.cols.correlation(["id","mass (g)", "reclat"], output="array")









    Out[45]:





{'cols': ['id', 'mass (g)', 'reclat'],
 'data': array([[ 1.        , -0.01794746,  0.27151272],
        [-0.01794746,  1.        ,  0.02908721],
        [ 0.27151272,  0.02908721,  1.        ]])}



In [ ]:

name 1 (string) nullable	id 2 (int) nullable	nametype 3 (string) nullable	recclass 4 (string) nullable	mass (g) 5 (double) nullable	fall 6 (string) nullable	year 7 (string) nullable	reclat 8 (double) nullable	reclong 9 (double) nullable	GeoLocation 10 (string) nullable
Acfer⋅232	240	Valid	H5	725.0	Found	01/01/1991⋅12:00:00⋅AM	27.73944	4.32833	(27.739440,⋅4.328330)
Elephant⋅Moraine⋅90232	8641	Valid	L6	16.9	Found	01/01/1990⋅12:00:00⋅AM	-76.28795	156.46841	(-76.287950,⋅156.468410)
Grove⋅Mountains⋅020090	30681	Valid	Martian⋅(shergottite)	7.5	Found	01/01/2003⋅12:00:00⋅AM	-72.99944	75.26111	(-72.999440,⋅75.261110)
Northwest⋅Africa⋅891	31912	Valid	H4	70.8	Found	01/01/2001⋅12:00:00⋅AM	None	None	None
Queen⋅Alexandra⋅Range⋅93098	19187	Valid	H6	1.2	Found	01/01/1993⋅12:00:00⋅AM	-84.5757	162.56524	(-84.575700,⋅162.565240)
Queen⋅Alexandra⋅Range⋅94691	20322	Valid	H6	9.6	Found	01/01/1994⋅12:00:00⋅AM	-84.0	168.0	(-84.000000,⋅168.000000)
Meteorite⋅Hills⋅00977	16211	Valid	H5	13.2	Found	01/01/2000⋅12:00:00⋅AM	-79.68333	159.75	(-79.683330,⋅159.750000)
Grove⋅Mountains⋅020114	46531	Valid	L3	1.0	Found	01/01/2003⋅12:00:00⋅AM	-72.98194	75.25167	(-72.981940,⋅75.251670)
Pecora⋅Escarpment⋅91483	18774	Valid	H5	5.5	Found	01/01/1991⋅12:00:00⋅AM	-85.55819	-68.31586	(-85.558190,⋅-68.315860)
Ramlat⋅as⋅Sahmah⋅390	55656	Valid	H3.8-6	0.69	Found	01/01/2010⋅12:00:00⋅AM	20.0949	55.69318	(20.094900,⋅55.693180)

Number of columns	10
Number of rows	45716
Total Missing (%)	0.49%
Total size in memory	64.0 MB

Value	Count	Frequency (%)
Święcany	1	0.002%
Łowicz	1	0.002%
Österplana 064	1	0.002%
Österplana 063	1	0.002%
Österplana 062	1	0.002%
Österplana 061	1	0.002%
Österplana 060	1	0.002%
Österplana 059	1	0.002%
Österplana 058	1	0.002%
Österplana 057	1	0.002%
"Missing"	0	0.0%

Value	Count	Frequency (%)
None	7315	16.001%
(0.000000, 0.000000)	6214	13.593%
(-71.500000, 35.666670)	4761	10.414%
(-84.000000, 168.000000)	3040	6.65%
(-72.000000, 26.000000)	1505	3.292%
(-79.683330, 159.750000)	657	1.437%
(-76.716670, 159.666670)	637	1.393%
(-76.183330, 157.166670)	539	1.179%
(-79.683330, 155.750000)	473	1.035%
(-84.216670, 160.500000)	263	0.575%
"Missing"	0	0.0%

Minimum	-87.36667
5-th percentile
Q1
Median
Q3
95-th percentile
Maximum	81.16667
Range	168.53334
Interquartile range	0.0

Standard deviation	46.37851
Coef of variation	-1.18547
Kurtosis	-1.4768
Mean	-39.12258
MAD	0.0
Skewness
Sum	-1502346.20654
Variance	2150.96632