notebook.community

Edit and run



In [1]:

    
# Enable IPython's autoreload extension; mode 2 re-imports edited modules
# before each cell executes, so local code changes are picked up without a restart.
%load_ext autoreload
%autoreload 2



In [2]:

    
import sys

# Make the parent directory importable (presumably where the local `infer` and
# `optimus` packages live — confirm against the repository layout).
# Guarded so that re-running this cell does not append duplicate entries.
if ".." not in sys.path:
    sys.path.append("..")



In [3]:

    
from infer import Infer
# Ask Infer whether the integer 12 matches the "string" dtype;
# the recorded output for this call is False.
Infer.value(12, "string")









    Out[3]:





False



In [3]:

    
from optimus.optimus import Optimus









    



    You are using PySparkling of version 2.4.10, but your PySpark is of
    version 2.3.1. Please make sure Spark and PySparkling versions are compatible.



In [5]:

    
# from optimus import parse



In [6]:

    
# from optimus.bumblebee import Comm
# comm = Comm("this_is_the_queue_name")



In [5]:

    
# Create the Optimus entry point with `comm=True`; the recorded output of this
# cell prints a Bumblebee link, so the flag appears to enable that integration.
op = Optimus(comm=True)









    




Open Bumblebee: https://app.hi-bumblebee.com
If you really care about privacy get your keys in bumblebee.ini and put them here






    



C:/Users/argenisleon/Documents/Optimus/optimus/../infer.py



In [8]:

    
# Build a small in-memory sample DataFrame (6 rows x 16 columns) that exercises
# many Spark data types and deliberately contains dirty values.
# NOTE(review): the wildcard import hides where ShortType, ByteType, etc. come from;
# prefer explicit names once it is confirmed no later cell relies on the others.
from pyspark.sql.types import *
from datetime import date, datetime

# Column specification. Each entry is either a (name, dtype) tuple — dtype given as a
# string alias or as a pyspark type instance — or a bare column name with no dtype
# (presumably inferred by op.create.df — TODO confirm).
cols = [
        ("names", "str"),
        ("height(ft)", ShortType()),
        ("function", "str"),
        ("rank", ByteType()),
        ("age", "int"),
        ("weight(t)", "float"),
        "japanese name",
        "last position seen",
        "date arrival",
        "last date seen",
        ("attributes", ArrayType(FloatType())),
        ("DateType", DateType()),
        ("timestamp", TimestampType()),
        ("Cybertronian", BooleanType()),
        ("function(binary)", BinaryType()),
        ("NullType", NullType())

    ]

# Row data, one tuple per row in the same order as `cols`. The values are intentionally
# messy: None entries, the literal string "None", special characters, trailing spaces,
# and a duplicated name ("1 Megatron").
rows = [
        ("argenisleon@gmail.com", 28, "Leader", 10, 5000000, 4.30, ["Inochi", "Convoy"], "19.442735,-99.201111", "1980/04/10",
         "2016/09/10", [8.5344, 4300.0], date(2016, 9, 10), datetime(2014, 6, 24), True, bytearray("Leader", "utf-8"),
         None),
        ("bumbl#ebéé  ", 17, "Espionage", 7, 5000000, 2.0, ["Bumble", "Goldback"], "10.642707,-71.612534", "1980/04/10",
         "2015/08/10", [5.334, 2000.0], date(2015, 8, 10), datetime(2014, 6, 24), True, bytearray("Espionage", "utf-8"),
         None),
        ("ironhide&", 26, "Security", 7, 5000000, 4.0, ["Roadbuster"], "37.789563,-122.400356", "1980/04/10",
         "2014/07/10", [7.9248, 4000.0], date(2014, 6, 24), datetime(2014, 6, 24), True, bytearray("Security", "utf-8"),
         None),
        ("1 Megatron", 13, "First Lieutenant", 8, 5000000, 1.80, ["Meister"], "33.670666,-117.841553", "1980/04/10",
         "2013/06/10", [3.9624, 1800.0], date(2013, 6, 24), datetime(2014, 6, 24), True,
         bytearray("First Lieutenant", "utf-8"), None),
        ("1 Megatron", None, "None", 10, 5000000, 5.70, ["Megatron"], None, "1980/04/10", "2012/05/10", [None, 5700.0],
         date(2012, 5, 10), datetime(2014, 6, 24), True, bytearray("None", "utf-8"), None),
        (None, 300, "Battle Station", 8, 5000000, None, ["Metroflex"], None, "1980/04/10", "2011/04/10",
         [91.44, None], date(2011, 4, 10), datetime(2014, 6, 24), True, bytearray("Battle Station", "utf-8"), None),

    ]
# Create the frame, cache it, and collapse it to a single partition.
# NOTE(review): the meaning of the third positional argument (False) is not visible
# here — confirm against the op.create.df signature.
df = op.create.df(cols ,rows, False).cache().repartition(1)



In [19]:

    
# Render `df` as a table; 20 is presumably the row limit (confirm against the table() signature).
# NOTE(review): the recorded output shows 19 rows / 8 columns (id, firstName, ...),
# not the 16-column frame built in the preceding cell — the cell was evidently run
# out of order against a different `df`; re-run top to bottom to refresh it.
df.table(20)









    









Viewing 19 of 19 rows / 8 columns
1 partition(s)


    
    
        
        
            id
            1 (int)
            
                
                nullable
                
            
        
        
        
            firstName
            2 (string)
            
                
                nullable
                
            
        
        
        
            lastName
            3 (string)
            
                
                nullable
                
            
        
        
        
            billingId
            4 (int)
            
                
                nullable
                
            
        
        
        
            product
            5 (string)
            
                
                nullable
                
            
        
        
        
            price
            6 (int)
            
                
                nullable
                
            
        
        
        
            birth
            7 (string)
            
                
                nullable
                
            
        
        
        
            dummyCol
            8 (string)
            
                
                nullable
                
            
        
        
    

    
    
    
    
        
        
            
                
                1
                 
            
        
        
        
            
                
                Luis
                 
            
        
        
        
            
                
                Alvarez$$%!
                 
            
        
        
        
            
                
                123
                 
            
        
        
        
            
                
                Cake
                 
            
        
        
        
            
                
                10
                 
            
        
        
        
            
                
                1980/07/07
                 
            
        
        
        
            
                
                never
                 
            
        
        
    
    
    
        
        
            
                
                2
                 
            
        
        
        
            
                
                André
                 
            
        
        
        
            
                
                Ampère
                 
            
        
        
        
            
                
                423
                 
            
        
        
        
            
                
                piza
                 
            
        
        
        
            
                
                8
                 
            
        
        
        
            
                
                1950/07/08
                 
            
        
        
        
            
                
                gonna
                 
            
        
        
    
    
    
        
        
            
                
                3
                 
            
        
        
        
            
                
                NiELS
                 
            
        
        
        
            
                
                Böhr//((%%
                 
            
        
        
        
            
                
                551
                 
            
        
        
        
            
                
                pizza
                 
            
        
        
        
            
                
                8
                 
            
        
        
        
            
                
                1990/07/09
                 
            
        
        
        
            
                
                give
                 
            
        
        
    
    
    
        
        
            
                
                4
                 
            
        
        
        
            
                
                PAUL
                 
            
        
        
        
            
                
                dirac$
                 
            
        
        
        
            
                
                521
                 
            
        
        
        
            
                
                pizza
                 
            
        
        
        
            
                
                8
                 
            
        
        
        
            
                
                1954/07/10
                 
            
        
        
        
            
                
                you
                 
            
        
        
    
    
    
        
        
            
                
                5
                 
            
        
        
        
            
                
                Albert
                 
            
        
        
        
            
                
                Einstein
                 
            
        
        
        
            
                
                634
                 
            
        
        
        
            
                
                pizza
                 
            
        
        
        
            
                
                8
                 
            
        
        
        
            
                
                1990/07/11
                 
            
        
        
        
            
                
                up
                 
            
        
        
    
    
    
        
        
            
                
                6
                 
            
        
        
        
            
                
                Galileo
                 
            
        
        
        
            
                
                ⋅⋅⋅⋅⋅⋅⋅⋅⋅⋅⋅⋅⋅GALiLEI
                 
            
        
        
        
            
                
                672
                 
            
        
        
        
            
                
                arepa
                 
            
        
        
        
            
                
                5
                 
            
        
        
        
            
                
                1930/08/12
                 
            
        
        
        
            
                
                never
                 
            
        
        
    
    
    
        
        
            
                
                7
                 
            
        
        
        
            
                
                CaRL
                 
            
        
        
        
            
                
                Ga%%%uss
                 
            
        
        
        
            
                
                323
                 
            
        
        
        
            
                
                taco
                 
            
        
        
        
            
                
                3
                 
            
        
        
        
            
                
                1970/07/13
                 
            
        
        
        
            
                
                gonna
                 
            
        
        
    
    
    
        
        
            
                
                8
                 
            
        
        
        
            
                
                David
                 
            
        
        
        
            
                
                H$$$ilbert
                 
            
        
        
        
            
                
                624
                 
            
        
        
        
            
                
                taaaccoo
                 
            
        
        
        
            
                
                3
                 
            
        
        
        
            
                
                1950/07/14
                 
            
        
        
        
            
                
                let
                 
            
        
        
    
    
    
        
        
            
                
                9
                 
            
        
        
        
            
                
                Johannes
                 
            
        
        
        
            
                
                KEPLER
                 
            
        
        
        
            
                
                735
                 
            
        
        
        
            
                
                taco
                 
            
        
        
        
            
                
                3
                 
            
        
        
        
            
                
                1920/04/22
                 
            
        
        
        
            
                
                you
                 
            
        
        
    
    
    
        
        
            
                
                10
                 
            
        
        
        
            
                
                JaMES
                 
            
        
        
        
            
                
                M$$ax%%well
                 
            
        
        
        
            
                
                875
                 
            
        
        
        
            
                
                taco
                 
            
        
        
        
            
                
                3
                 
            
        
        
        
            
                
                1923/03/12
                 
            
        
        
        
            
                
                down
                 
            
        
        
    
    
    
        
        
            
                
                11
                 
            
        
        
        
            
                
                Isaac
                 
            
        
        
        
            
                
                Newton
                 
            
        
        
        
            
                
                992
                 
            
        
        
        
            
                
                pasta
                 
            
        
        
        
            
                
                9
                 
            
        
        
        
            
                
                1999/02/15
                 
            
        
        
        
            
                
                never⋅
                 
            
        
        
    
    
    
        
        
            
                
                12
                 
            
        
        
        
            
                
                Emmy%%
                 
            
        
        
        
            
                
                Nöether$
                 
            
        
        
        
            
                
                234
                 
            
        
        
        
            
                
                pasta
                 
            
        
        
        
            
                
                9
                 
            
        
        
        
            
                
                1993/12/08
                 
            
        
        
        
            
                
                gonna
                 
            
        
        
    
    
    
        
        
            
                
                13
                 
            
        
        
        
            
                
                Max!!!
                 
            
        
        
        
            
                
                Planck!!!
                 
            
        
        
        
            
                
                111
                 
            
        
        
        
            
                
                hamburguer
                 
            
        
        
        
            
                
                4
                 
            
        
        
        
            
                
                1994/01/04
                 
            
        
        
        
            
                
                run⋅
                 
            
        
        
    
    
    
        
        
            
                
                14
                 
            
        
        
        
            
                
                Fred
                 
            
        
        
        
            
                
                Hoy&&≤
                 
            
        
        
        
            
                
                553
                 
            
        
        
        
            
                
                pizzza
                 
            
        
        
        
            
                
                8
                 
            
        
        
        
            
                
                1997/06/27
                 
            
        
        
        
            
                
                around
                 
            
        
        
    
    
    
        
        
            
                
                15
                 
            
        
        
        
            
                
                (((⋅⋅⋅Heinrich⋅)))))
                 
            
        
        
        
            
                
                Hertz
                 
            
        
        
        
            
                
                116
                 
            
        
        
        
            
                
                pizza
                 
            
        
        
        
            
                
                8
                 
            
        
        
        
            
                
                1956/11/30
                 
            
        
        
        
            
                
                and
                 
            
        
        
    
    
    
        
        
            
                
                16
                 
            
        
        
        
            
                
                William
                 
            
        
        
        
            
                
                Gilbert###
                 
            
        
        
        
            
                
                886
                 
            
        
        
        
            
                
                BEER
                 
            
        
        
        
            
                
                2
                 
            
        
        
        
            
                
                1958/03/26
                 
            
        
        
        
            
                
                desert
                 
            
        
        
    
    
    
        
        
            
                
                17
                 
            
        
        
        
            
                
                Marie
                 
            
        
        
        
            
                
                CURIE
                 
            
        
        
        
            
                
                912
                 
            
        
        
        
            
                
                Rice
                 
            
        
        
        
            
                
                1
                 
            
        
        
        
            
                
                2000/03/22
                 
            
        
        
        
            
                
                you
                 
            
        
        
    
    
    
        
        
            
                
                18
                 
            
        
        
        
            
                
                Arthur
                 
            
        
        
        
            
                
                COM%%%pton
                 
            
        
        
        
            
                
                812
                 
            
        
        
        
            
                
                110790
                 
            
        
        
        
            
                
                5
                 
            
        
        
        
            
                
                1899/01/01
                 
            
        
        
        
            
                
                #
                 
            
        
        
    
    
    
        
        
            
                
                19
                 
            
        
        
        
            
                
                JAMES
                 
            
        
        
        
            
                
                Chadwick
                 
            
        
        
        
            
                
                467
                 
            
        
        
        
            
                
                null
                 
            
        
        
        
            
                
                10
                 
            
        
        
        
            
                
                1921/05/03
                 
            
        
        
        
            
                
                #
                 
            
        
        
    
    
    



Viewing 19 of 19 rows / 8 columns
1 partition(s)



In [6]:

    
# Load the example CSV from the Optimus repository over HTTPS.
# Note that this rebinds `df`, replacing any frame created earlier in the session.
df = op.load.csv(
    "https://raw.githubusercontent.com/ironmussa/Optimus/master/examples/data/foo.csv",
    sep=",",
    header='true',
    infer_schema='true',
    charset="UTF-8",
    null_value="None",
)



In [7]:

    
# Build an outlier helper for the "price" column using the `mad` method
# (presumably median absolute deviation — confirm) with a threshold of 1.
outlier = df.outliers.mad("price", threshold=1)



In [8]:

    
# Histogram of "price" from the outlier helper; per the recorded output the result
# is a JSON-encoded string of bins (count / lower / upper), not a plot.
outlier.hist("price")









    Out[8]:





'{"price": {"hist": [{"count": 6.0, "lower": 8.0, "upper": 8.1}, {"count": 0.0, "lower": 8.1, "upper": 8.2}, {"count": 0.0, "lower": 8.2, "upper": 8.3}, {"count": 0.0, "lower": 8.3, "upper": 8.4}, {"count": 0.0, "lower": 8.4, "upper": 8.5}, {"count": 0.0, "lower": 8.5, "upper": 8.6}, {"count": 0.0, "lower": 8.6, "upper": 8.7}, {"count": 0.0, "lower": 8.7, "upper": 8.8}, {"count": 0.0, "lower": 8.8, "upper": 8.9}, {"count": 0.0, "lower": 8.9, "upper": 9.0}, {"count": 2.0, "lower": 9.0, "upper": 9.1}, {"count": 0.0, "lower": 9.1, "upper": 9.2}, {"count": 0.0, "lower": 9.2, "upper": 9.3}, {"count": 0.0, "lower": 9.3, "upper": 9.4}, {"count": 0.0, "lower": 9.4, "upper": 9.5}, {"count": 0.0, "lower": 9.5, "upper": 9.6}, {"count": 0.0, "lower": 9.6, "upper": 9.7}, {"count": 0.0, "lower": 9.7, "upper": 9.8}, {"count": 0.0, "lower": 9.8, "upper": 9.9}, {"count": 0.0, "lower": 9.9, "upper": 10.0}]}}'



In [12]:

    
# Tally the values of the "id" column by detected dtype; the recorded output reports
# 19 ints with no null or missing values.
df.cols.count_by_dtypes("id")









    Out[12]:





{'id': {'null': 0, 'missing': 0, 'int': 19}}



In [22]:

    
# Row count of the current `df` (19 in the recorded output).
df.count()









    Out[22]:





19



In [24]:

    
# Summary of the outlier analysis: outlier / non-outlier counts plus the lower and
# upper bounds, returned as a dict.
# NOTE(review): the recorded output also prints a bare `6` before the dict — this
# looks like leftover debug output inside info(); confirm and remove at the source.
outlier.info()









    



6






    Out[24]:





{'count_outliers': 9,
 'count_non_outliers': 10,
 'lower_bound': 6,
 'lower_bound_count': 9,
 'upper_bound': 10,
 'upper_bound_count': 0}



In [11]:

    
# df.table()



In [12]:

    
# Count how many values in the "names" column match the "email" dtype versus
# mismatch / null / missing (see the recorded output for the result shape).
# The original dict literal listed the key "names" twice; Python keeps only the last
# value for a repeated key, so the first entry ("argenisleon@gmail.com") was silently
# discarded. Only the mapping that actually took effect is kept.
df.cols.count_mismatch({"names": "email"})









    Out[12]:





{'names': {'email': 1, 'mismatch': 4, 'null': 1, 'missing': 0}}



In [14]:

    
# Scratch dict shaped like a count_mismatch result for "names"
# (without the 'missing' entry).
a = {
    'names': {
        'email': 1,
        'mismatch': 4,
        'null': 1,
    },
}



In [15]:

    
# Collect the dict's values (the dtype names) as a tuple, in insertion order.
tuple(
    {
        "firstName": "string",
        "lastName": "array",
    }.values()
)









    Out[15]:





('string', 'array')



In [16]:

    
from infer import Infer



In [17]:

    
from infer import Infer

# Classify a (column, value) pair against the dtype expected for that column.
# With a None value under "names", the recorded output is (('names', 'null'), 1).
Infer.mismatch(
    ("names", None),
    {"names": "email"},
)









    Out[17]:





(('names', 'null'), 1)



In [20]:

    
# Repeat of the earlier check: does the integer 12 match the "string" dtype?
# No output was recorded for this run of the cell.
Infer.value(12, "string")



In [36]:

    
# Iterating a dict yields its keys, so this lists the column names in insertion order.
list({"firstName": "string", "lastName": "string"})









    Out[36]:





['firstName', 'lastName']



In [8]:

    
# Select the rows of "names" whose value matches the "str" dtype.
# NOTE(review): the recorded output is a NameError ("name 'df' is not defined"),
# i.e. this cell was executed before `df` existed in that kernel session; re-run
# after the cell that creates `df` so the notebook does not ship a failing cell.
df.rows.select_by_dtypes("names","str")









    



---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-8-5a6988a57346> in <module>
----> 1 df.rows.select_by_dtypes("names","str")

NameError: name 'df' is not defined



In [117]:

    
# Filter rows by a range on "height(ft)" (bounds 17 and 26, not inverted,
# equal=True) and render the result as a table. This is a range filter, not a histogram.
# NOTE(review): the recorded output includes a row with height 13, which lies outside
# 17..26 — either the output is stale or `between` mishandles the bounds; verify.
df.rows.between(
    "height(ft)",
    17,
    26,
    invert=False,
    equal=True,
).table()









    









Viewing 3 of 3 rows / 16 columns
1 partition(s)


    
    
        
        
            names
            1 (string)
            
                
                nullable
                
            
        
        
        
            height(ft)
            2 (smallint)
            
                
                nullable
                
            
        
        
        
            function
            3 (string)
            
                
                nullable
                
            
        
        
        
            rank
            4 (tinyint)
            
                
                nullable
                
            
        
        
        
            age
            5 (int)
            
                
                nullable
                
            
        
        
        
            weight(t)
            6 (float)
            
                
                nullable
                
            
        
        
        
            japanese name
            7 (array<string>)
            
                
                nullable
                
            
        
        
        
            last position seen
            8 (string)
            
                
                nullable
                
            
        
        
        
            date arrival
            9 (string)
            
                
                nullable
                
            
        
        
        
            last date seen
            10 (string)
            
                
                nullable
                
            
        
        
        
            attributes
            11 (array<float>)
            
                
                nullable
                
            
        
        
        
            DateType
            12 (date)
            
                
                nullable
                
            
        
        
        
            timestamp
            13 (timestamp)
            
                
                nullable
                
            
        
        
        
            Cybertronian
            14 (boolean)
            
                
                nullable
                
            
        
        
        
            function(binary)
            15 (binary)
            
                
                nullable
                
            
        
        
        
            NullType
            16 (null)
            
                
                nullable
                
            
        
        
    

    
    
    
    
        
        
            
                
                bumbl#ebéé⋅⋅
                 
            
        
        
        
            
                
                17
                 
            
        
        
        
            
                
                Espionage
                 
            
        
        
        
            
                
                7
                 
            
        
        
        
            
                
                5000000
                 
            
        
        
        
            
                
                2.0
                 
            
        
        
        
            
                
                ['Bumble',⋅'Goldback']
                 
            
        
        
        
            
                
                10.642707,-71.612534
                 
            
        
        
        
            
                
                1980/04/10
                 
            
        
        
        
            
                
                2015/08/10
                 
            
        
        
        
            
                
                [5.334000110626221,⋅2000.0]
                 
            
        
        
        
            
                
                2015-08-10
                 
            
        
        
        
            
                
                2014-06-24⋅00:00:00
                 
            
        
        
        
            
                
                True
                 
            
        
        
        
            
                
                bytearray(b'Espionage')
                 
            
        
        
        
            
                
                None
                 
            
        
        
    
    
    
        
        
            
                
                ironhide&
                 
            
        
        
        
            
                
                26
                 
            
        
        
        
            
                
                Security
                 
            
        
        
        
            
                
                7
                 
            
        
        
        
            
                
                5000000
                 
            
        
        
        
            
                
                4.0
                 
            
        
        
        
            
                
                ['Roadbuster']
                 
            
        
        
        
            
                
                37.789563,-122.400356
                 
            
        
        
        
            
                
                1980/04/10
                 
            
        
        
        
            
                
                2014/07/10
                 
            
        
        
        
            
                
                [7.924799919128418,⋅4000.0]
                 
            
        
        
        
            
                
                2014-06-24
                 
            
        
        
        
            
                
                2014-06-24⋅00:00:00
                 
            
        
        
        
            
                
                True
                 
            
        
        
        
            
                
                bytearray(b'Security')
                 
            
        
        
        
            
                
                None
                 
            
        
        
    
    
    
        
        
            
                
                1⋅Megatron
                 
            
        
        
        
            
                
                13
                 
            
        
        
        
            
                
                First⋅Lieutenant
                 
            
        
        
        
            
                
                8
                 
            
        
        
        
            
                
                5000000
                 
            
        
        
        
            
                
                1.7999999523162842
                 
            
        
        
        
            
                
                ['Meister']
                 
            
        
        
        
            
                
                33.670666,-117.841553
                 
            
        
        
        
            
                
                1980/04/10
                 
            
        
        
        
            
                
                2013/06/10
                 
            
        
        
        
            
                
                [3.962399959564209,⋅1800.0]
                 
            
        
        
        
            
                
                2013-06-24
                 
            
        
        
        
            
                
                2014-06-24⋅00:00:00
                 
            
        
        
        
            
                
                True
                 
            
        
        
        
            
                
                bytearray(b'First⋅Lieutenant')
                 
            
        
        
        
            
                
                None
                 
            
        
        
    
    
    



Viewing 3 of 3 rows / 16 columns
1 partition(s)



In [ ]:



In [55]:

    
# Reverse the characters of every value in the "function" column
# (e.g. "Leader" is shown as "redaeL" in the output) and render the result as a table.
df.cols.reverse("function").table()









    









Viewing 6 of 6 rows / 16 columns
1 partition(s)


    
    
        
        
            names
            1 (string)
            
                
                nullable
                
            
        
        
        
            height(ft)
            2 (smallint)
            
                
                nullable
                
            
        
        
        
            function
            3 (string)
            
                
                nullable
                
            
        
        
        
            rank
            4 (tinyint)
            
                
                nullable
                
            
        
        
        
            age
            5 (int)
            
                
                nullable
                
            
        
        
        
            weight(t)
            6 (float)
            
                
                nullable
                
            
        
        
        
            japanese name
            7 (array<string>)
            
                
                nullable
                
            
        
        
        
            last position seen
            8 (string)
            
                
                nullable
                
            
        
        
        
            date arrival
            9 (string)
            
                
                nullable
                
            
        
        
        
            last date seen
            10 (string)
            
                
                nullable
                
            
        
        
        
            attributes
            11 (array<float>)
            
                
                nullable
                
            
        
        
        
            DateType
            12 (date)
            
                
                nullable
                
            
        
        
        
            timestamp
            13 (timestamp)
            
                
                nullable
                
            
        
        
        
            Cybertronian
            14 (boolean)
            
                
                nullable
                
            
        
        
        
            function(binary)
            15 (binary)
            
                
                nullable
                
            
        
        
        
            NullType
            16 (null)
            
                
                nullable
                
            
        
        
    

    
    
    
    
        
        
            
                
                Optimus⋅OptimusPrime
                 
            
        
        
        
            
                
                28
                 
            
        
        
        
            
                
                redaeL
                 
            
        
        
        
            
                
                10
                 
            
        
        
        
            
                
                5000000
                 
            
        
        
        
            
                
                4.300000190734863
                 
            
        
        
        
            
                
                ['Inochi',⋅'Convoy']
                 
            
        
        
        
            
                
                19.442735,-99.201111
                 
            
        
        
        
            
                
                1980/04/10
                 
            
        
        
        
            
                
                2016/09/10
                 
            
        
        
        
            
                
                [8.53439998626709,⋅4300.0]
                 
            
        
        
        
            
                
                2016-09-10
                 
            
        
        
        
            
                
                2014-06-24⋅00:00:00
                 
            
        
        
        
            
                
                True
                 
            
        
        
        
            
                
                bytearray(b'Leader')
                 
            
        
        
        
            
                
                None
                 
            
        
        
    
    
    
        
        
            
                
                bumbl#ebéé⋅⋅
                 
            
        
        
        
            
                
                17
                 
            
        
        
        
            
                
                eganoipsE
                 
            
        
        
        
            
                
                7
                 
            
        
        
        
            
                
                5000000
                 
            
        
        
        
            
                
                2.0
                 
            
        
        
        
            
                
                ['Bumble',⋅'Goldback']
                 
            
        
        
        
            
                
                10.642707,-71.612534
                 
            
        
        
        
            
                
                1980/04/10
                 
            
        
        
        
            
                
                2015/08/10
                 
            
        
        
        
            
                
                [5.334000110626221,⋅2000.0]
                 
            
        
        
        
            
                
                2015-08-10
                 
            
        
        
        
            
                
                2014-06-24⋅00:00:00
                 
            
        
        
        
            
                
                True
                 
            
        
        
        
            
                
                bytearray(b'Espionage')
                 
            
        
        
        
            
                
                None
                 
            
        
        
    
    
    
        
        
            
                
                ironhide&
                 
            
        
        
        
            
                
                26
                 
            
        
        
        
            
                
                ytiruceS
                 
            
        
        
        
            
                
                7
                 
            
        
        
        
            
                
                5000000
                 
            
        
        
        
            
                
                4.0
                 
            
        
        
        
            
                
                ['Roadbuster']
                 
            
        
        
        
            
                
                37.789563,-122.400356
                 
            
        
        
        
            
                
                1980/04/10
                 
            
        
        
        
            
                
                2014/07/10
                 
            
        
        
        
            
                
                [7.924799919128418,⋅4000.0]
                 
            
        
        
        
            
                
                2014-06-24
                 
            
        
        
        
            
                
                2014-06-24⋅00:00:00
                 
            
        
        
        
            
                
                True
                 
            
        
        
        
            
                
                bytearray(b'Security')
                 
            
        
        
        
            
                
                None
                 
            
        
        
    
    
    
        
        
            
                
                1⋅Megatron
                 
            
        
        
        
            
                
                13
                 
            
        
        
        
            
                
                tnanetueiL⋅tsriF
                 
            
        
        
        
            
                
                8
                 
            
        
        
        
            
                
                5000000
                 
            
        
        
        
            
                
                1.7999999523162842
                 
            
        
        
        
            
                
                ['Meister']
                 
            
        
        
        
            
                
                33.670666,-117.841553
                 
            
        
        
        
            
                
                1980/04/10
                 
            
        
        
        
            
                
                2013/06/10
                 
            
        
        
        
            
                
                [3.962399959564209,⋅1800.0]
                 
            
        
        
        
            
                
                2013-06-24
                 
            
        
        
        
            
                
                2014-06-24⋅00:00:00
                 
            
        
        
        
            
                
                True
                 
            
        
        
        
            
                
                bytearray(b'First⋅Lieutenant')
                 
            
        
        
        
            
                
                None
                 
            
        
        
    
    
    
        
        
            
                
                1⋅Megatron
                 
            
        
        
        
            
                
                None
                 
            
        
        
        
            
                
                enoN
                 
            
        
        
        
            
                
                10
                 
            
        
        
        
            
                
                5000000
                 
            
        
        
        
            
                
                5.699999809265137
                 
            
        
        
        
            
                
                ['Megatron']
                 
            
        
        
        
            
                
                None
                 
            
        
        
        
            
                
                1980/04/10
                 
            
        
        
        
            
                
                2012/05/10
                 
            
        
        
        
            
                
                [None,⋅5700.0]
                 
            
        
        
        
            
                
                2012-05-10
                 
            
        
        
        
            
                
                2014-06-24⋅00:00:00
                 
            
        
        
        
            
                
                True
                 
            
        
        
        
            
                
                bytearray(b'None')
                 
            
        
        
        
            
                
                None
                 
            
        
        
    
    
    
        
        
            
                
                megatron⋅1
                 
            
        
        
        
            
                
                300
                 
            
        
        
        
            
                
                noitatS⋅elttaB
                 
            
        
        
        
            
                
                8
                 
            
        
        
        
            
                
                5000000
                 
            
        
        
        
            
                
                None
                 
            
        
        
        
            
                
                ['Metroflex']
                 
            
        
        
        
            
                
                None
                 
            
        
        
        
            
                
                1980/04/10
                 
            
        
        
        
            
                
                2011/04/10
                 
            
        
        
        
            
                
                [91.44000244140625,⋅None]
                 
            
        
        
        
            
                
                2011-04-10
                 
            
        
        
        
            
                
                2014-06-24⋅00:00:00
                 
            
        
        
        
            
                
                True
                 
            
        
        
        
            
                
                bytearray(b'Battle⋅Station')
                 
            
        
        
        
            
                
                None
                 
            
        
        
    
    
    



Viewing 6 of 6 rows / 16 columns
1 partition(s)



In [20]:

    
# Create the outlier helper for the "mass (g)" column; the next cell queries it through `outlier`.
# NOTE(review): presumably this applies Tukey's fences (IQR rule), going by the method name -- confirm.
# NOTE(review): `df` must be a frame that has a "mass (g)" column; the 16-column frame displayed
# in the preceding output does not, so this cell relies on `df` having been rebound elsewhere.
outlier = df.outliers.tukey("mass (g)")



In [28]:

    
# Disabled debug aid, kept for reference:
# print(outlier.info())
# Show the helper's lower-bound selection; the recorded output is a JSON string
# with a "columns" entry for "mass (g)" and a "value" list of single-element rows.
# NOTE(review): presumably these are the rows falling below the lower Tukey fence -- confirm.
outlier.select_lower_bound()









    Out[28]:





'{"columns": [{"title": "mass (g)"}], "value": [[21.0], [160.0], [252.0], [256.8], [320.0], [41.0], [94.2], [265.0], [146.0], [134.0], [345.0], [14.0], [23.2], [17.0], [375.0], [270.0], [13.9], [18.0], [100.0], [488.1], [470.0], [67.8], [56.0], [190.0], [219.0], [324.0], [357.0], [212.0], [478.0], [342.0], [8.0], [94.0], [45.6], [0.5], [72.0], [367.0], [303.0], [48.6], [469.0], [78.4], [167.0], [100.0], [340.0], [28.0], [0.8], [230.0], [400.0], [438.0], [230.0], [30.0], [300.0], [188.0], [127.0], [277.0], [113.0], [107.2], [380.0], [82.0], [220.0], [240.0], [132.7], [36.1], [28.0], [380.0], [102.0], [480.0], [45.5], [215.0], [288.0], [28.0], [0.2], [315.0], [414.0], [167.7], [305.5], [180.0], [266.1], [112.0], [22.0], [450.0], [222.0], [100.0], [30.0], [483.0], [89.0], [230.0], [350.0], [448.0], [299.0], [400.0], [180.0], [450.0], [100.0], [331.0], [195.0], [140.0], [67.4], [97.7], [202.6], [136.0]]}'



In [256]:

    
# Append a fingerprint column for "product" (shown as "product***FINGERPRINT",
# e.g. "Cake" -> "cake") and render the resulting frame as a table.
keyCol.fingerprint(df, "product").table()









    









Viewing 10 of 19 rows / 9 columns
1 partition(s)


    
    
        
        
            id
            1 (int)
            
                
                nullable
                
            
        
        
        
            firstName
            2 (string)
            
                
                nullable
                
            
        
        
        
            lastName
            3 (string)
            
                
                nullable
                
            
        
        
        
            billingId
            4 (int)
            
                
                nullable
                
            
        
        
        
            product
            5 (string)
            
                
                nullable
                
            
        
        
        
            price
            6 (int)
            
                
                nullable
                
            
        
        
        
            birth
            7 (string)
            
                
                nullable
                
            
        
        
        
            dummyCol
            8 (string)
            
                
                nullable
                
            
        
        
        
            product***FINGERPRINT
            9 (string)
            
                
                nullable
                
            
        
        
    

    
    
    
    
        
        
            
                
                1
                 
            
        
        
        
            
                
                Luis
                 
            
        
        
        
            
                
                Alvarez$$%!
                 
            
        
        
        
            
                
                123
                 
            
        
        
        
            
                
                Cake
                 
            
        
        
        
            
                
                10
                 
            
        
        
        
            
                
                1980/07/07
                 
            
        
        
        
            
                
                never
                 
            
        
        
        
            
                
                cake
                 
            
        
        
    
    
    
        
        
            
                
                2
                 
            
        
        
        
            
                
                André
                 
            
        
        
        
            
                
                Ampère
                 
            
        
        
        
            
                
                423
                 
            
        
        
        
            
                
                piza
                 
            
        
        
        
            
                
                8
                 
            
        
        
        
            
                
                1950/07/08
                 
            
        
        
        
            
                
                gonna
                 
            
        
        
        
            
                
                piza
                 
            
        
        
    
    
    
        
        
            
                
                3
                 
            
        
        
        
            
                
                NiELS
                 
            
        
        
        
            
                
                Böhr//((%%
                 
            
        
        
        
            
                
                551
                 
            
        
        
        
            
                
                pizza
                 
            
        
        
        
            
                
                8
                 
            
        
        
        
            
                
                1990/07/09
                 
            
        
        
        
            
                
                give
                 
            
        
        
        
            
                
                pizza
                 
            
        
        
    
    
    
        
        
            
                
                4
                 
            
        
        
        
            
                
                PAUL
                 
            
        
        
        
            
                
                dirac$
                 
            
        
        
        
            
                
                521
                 
            
        
        
        
            
                
                pizza
                 
            
        
        
        
            
                
                8
                 
            
        
        
        
            
                
                1954/07/10
                 
            
        
        
        
            
                
                you
                 
            
        
        
        
            
                
                pizza
                 
            
        
        
    
    
    
        
        
            
                
                5
                 
            
        
        
        
            
                
                Albert
                 
            
        
        
        
            
                
                Einstein
                 
            
        
        
        
            
                
                634
                 
            
        
        
        
            
                
                pizza
                 
            
        
        
        
            
                
                8
                 
            
        
        
        
            
                
                1990/07/11
                 
            
        
        
        
            
                
                up
                 
            
        
        
        
            
                
                pizza
                 
            
        
        
    
    
    
        
        
            
                
                6
                 
            
        
        
        
            
                
                Galileo
                 
            
        
        
        
            
                
                ⋅⋅⋅⋅⋅⋅⋅⋅⋅⋅⋅⋅⋅GALiLEI
                 
            
        
        
        
            
                
                672
                 
            
        
        
        
            
                
                arepa
                 
            
        
        
        
            
                
                5
                 
            
        
        
        
            
                
                1930/08/12
                 
            
        
        
        
            
                
                never
                 
            
        
        
        
            
                
                arepa
                 
            
        
        
    
    
    
        
        
            
                
                7
                 
            
        
        
        
            
                
                CaRL
                 
            
        
        
        
            
                
                Ga%%%uss
                 
            
        
        
        
            
                
                323
                 
            
        
        
        
            
                
                taco
                 
            
        
        
        
            
                
                3
                 
            
        
        
        
            
                
                1970/07/13
                 
            
        
        
        
            
                
                gonna
                 
            
        
        
        
            
                
                taco
                 
            
        
        
    
    
    
        
        
            
                
                8
                 
            
        
        
        
            
                
                David
                 
            
        
        
        
            
                
                H$$$ilbert
                 
            
        
        
        
            
                
                624
                 
            
        
        
        
            
                
                taaaccoo
                 
            
        
        
        
            
                
                3
                 
            
        
        
        
            
                
                1950/07/14
                 
            
        
        
        
            
                
                let
                 
            
        
        
        
            
                
                taaaccoo
                 
            
        
        
    
    
    
        
        
            
                
                9
                 
            
        
        
        
            
                
                Johannes
                 
            
        
        
        
            
                
                KEPLER
                 
            
        
        
        
            
                
                735
                 
            
        
        
        
            
                
                taco
                 
            
        
        
        
            
                
                3
                 
            
        
        
        
            
                
                1920/04/22
                 
            
        
        
        
            
                
                you
                 
            
        
        
        
            
                
                taco
                 
            
        
        
    
    
    
        
        
            
                
                10
                 
            
        
        
        
            
                
                JaMES
                 
            
        
        
        
            
                
                M$$ax%%well
                 
            
        
        
        
            
                
                875
                 
            
        
        
        
            
                
                taco
                 
            
        
        
        
            
                
                3
                 
            
        
        
        
            
                
                1923/03/12
                 
            
        
        
        
            
                
                down
                 
            
        
        
        
            
                
                taco
                 
            
        
        
    
    
    



Viewing 10 of 19 rows / 9 columns
1 partition(s)



In [245]:

    
# Append a fingerprint column for "names" (shown as "names***FINGERPRINT",
# e.g. "Optimus OptimusPrime" -> "optimusoptimusprime") and render the frame as a table.
keyCol.fingerprint(df, "names").table()









    









Viewing 6 of 6 rows / 17 columns
1 partition(s)


    
    
        
        
            names
            1 (string)
            
                
                nullable
                
            
        
        
        
            height(ft)
            2 (smallint)
            
                
                nullable
                
            
        
        
        
            function
            3 (string)
            
                
                nullable
                
            
        
        
        
            rank
            4 (tinyint)
            
                
                nullable
                
            
        
        
        
            age
            5 (int)
            
                
                nullable
                
            
        
        
        
            weight(t)
            6 (float)
            
                
                nullable
                
            
        
        
        
            japanese name
            7 (array<string>)
            
                
                nullable
                
            
        
        
        
            last position seen
            8 (string)
            
                
                nullable
                
            
        
        
        
            date arrival
            9 (string)
            
                
                nullable
                
            
        
        
        
            last date seen
            10 (string)
            
                
                nullable
                
            
        
        
        
            attributes
            11 (array<float>)
            
                
                nullable
                
            
        
        
        
            DateType
            12 (date)
            
                
                nullable
                
            
        
        
        
            timestamp
            13 (timestamp)
            
                
                nullable
                
            
        
        
        
            Cybertronian
            14 (boolean)
            
                
                nullable
                
            
        
        
        
            function(binary)
            15 (binary)
            
                
                nullable
                
            
        
        
        
            NullType
            16 (null)
            
                
                nullable
                
            
        
        
        
            names***FINGERPRINT
            17 (string)
            
                
                nullable
                
            
        
        
    

    
    
    
    
        
        
            
                
                Optimus⋅OptimusPrime
                 
            
        
        
        
            
                
                28
                 
            
        
        
        
            
                
                Leader
                 
            
        
        
        
            
                
                10
                 
            
        
        
        
            
                
                5000000
                 
            
        
        
        
            
                
                4.300000190734863
                 
            
        
        
        
            
                
                ['Inochi',⋅'Convoy']
                 
            
        
        
        
            
                
                19.442735,-99.201111
                 
            
        
        
        
            
                
                1980/04/10
                 
            
        
        
        
            
                
                2016/09/10
                 
            
        
        
        
            
                
                [8.53439998626709,⋅4300.0]
                 
            
        
        
        
            
                
                2016-09-10
                 
            
        
        
        
            
                
                2014-06-24⋅00:00:00
                 
            
        
        
        
            
                
                True
                 
            
        
        
        
            
                
                bytearray(b'Leader')
                 
            
        
        
        
            
                
                None
                 
            
        
        
        
            
                
                optimusoptimusprime
                 
            
        
        
    
    
    
        
        
            
                
                bumbl#ebéé⋅⋅
                 
            
        
        
        
            
                
                17
                 
            
        
        
        
            
                
                Espionage
                 
            
        
        
        
            
                
                7
                 
            
        
        
        
            
                
                5000000
                 
            
        
        
        
            
                
                2.0
                 
            
        
        
        
            
                
                ['Bumble',⋅'Goldback']
                 
            
        
        
        
            
                
                10.642707,-71.612534
                 
            
        
        
        
            
                
                1980/04/10
                 
            
        
        
        
            
                
                2015/08/10
                 
            
        
        
        
            
                
                [5.334000110626221,⋅2000.0]
                 
            
        
        
        
            
                
                2015-08-10
                 
            
        
        
        
            
                
                2014-06-24⋅00:00:00
                 
            
        
        
        
            
                
                True
                 
            
        
        
        
            
                
                bytearray(b'Espionage')
                 
            
        
        
        
            
                
                None
                 
            
        
        
        
            
                
                bumblebee
                 
            
        
        
    
    
    
        
        
            
                
                ironhide&
                 
            
        
        
        
            
                
                26
                 
            
        
        
        
            
                
                Security
                 
            
        
        
        
            
                
                7
                 
            
        
        
        
            
                
                5000000
                 
            
        
        
        
            
                
                4.0
                 
            
        
        
        
            
                
                ['Roadbuster']
                 
            
        
        
        
            
                
                37.789563,-122.400356
                 
            
        
        
        
            
                
                1980/04/10
                 
            
        
        
        
            
                
                2014/07/10
                 
            
        
        
        
            
                
                [7.924799919128418,⋅4000.0]
                 
            
        
        
        
            
                
                2014-06-24
                 
            
        
        
        
            
                
                2014-06-24⋅00:00:00
                 
            
        
        
        
            
                
                True
                 
            
        
        
        
            
                
                bytearray(b'Security')
                 
            
        
        
        
            
                
                None
                 
            
        
        
        
            
                
                ironhide
                 
            
        
        
    
    
    
        
        
            
                
                1⋅Megatron
                 
            
        
        
        
            
                
                13
                 
            
        
        
        
            
                
                First⋅Lieutenant
                 
            
        
        
        
            
                
                8
                 
            
        
        
        
            
                
                5000000
                 
            
        
        
        
            
                
                1.7999999523162842
                 
            
        
        
        
            
                
                ['Meister']
                 
            
        
        
        
            
                
                33.670666,-117.841553
                 
            
        
        
        
            
                
                1980/04/10
                 
            
        
        
        
            
                
                2013/06/10
                 
            
        
        
        
            
                
                [3.962399959564209,⋅1800.0]
                 
            
        
        
        
            
                
                2013-06-24
                 
            
        
        
        
            
                
                2014-06-24⋅00:00:00
                 
            
        
        
        
            
                
                True
                 
            
        
        
        
            
                
                bytearray(b'First⋅Lieutenant')
                 
            
        
        
        
            
                
                None
                 
            
        
        
        
            
                
                1megatron
                 
            
        
        
    
    
    
        
        
            
                
                1⋅Megatron
                 
            
        
        
        
            
                
                None
                 
            
        
        
        
            
                
                None
                 
            
        
        
        
            
                
                10
                 
            
        
        
        
            
                
                5000000
                 
            
        
        
        
            
                
                5.699999809265137
                 
            
        
        
        
            
                
                ['Megatron']
                 
            
        
        
        
            
                
                None
                 
            
        
        
        
            
                
                1980/04/10
                 
            
        
        
        
            
                
                2012/05/10
                 
            
        
        
        
            
                
                [None,⋅5700.0]
                 
            
        
        
        
            
                
                2012-05-10
                 
            
        
        
        
            
                
                2014-06-24⋅00:00:00
                 
            
        
        
        
            
                
                True
                 
            
        
        
        
            
                
                bytearray(b'None')
                 
            
        
        
        
            
                
                None
                 
            
        
        
        
            
                
                1megatron
                 
            
        
        
    
    
    
        
        
            
                
                megatron⋅1
                 
            
        
        
        
            
                
                300
                 
            
        
        
        
            
                
                Battle⋅Station
                 
            
        
        
        
            
                
                8
                 
            
        
        
        
            
                
                5000000
                 
            
        
        
        
            
                
                None
                 
            
        
        
        
            
                
                ['Metroflex']
                 
            
        
        
        
            
                
                None
                 
            
        
        
        
            
                
                1980/04/10
                 
            
        
        
        
            
                
                2011/04/10
                 
            
        
        
        
            
                
                [91.44000244140625,⋅None]
                 
            
        
        
        
            
                
                2011-04-10
                 
            
        
        
        
            
                
                2014-06-24⋅00:00:00
                 
            
        
        
        
            
                
                True
                 
            
        
        
        
            
                
                bytearray(b'Battle⋅Station')
                 
            
        
        
        
            
                
                None
                 
            
        
        
        
            
                
                1megatron
                 
            
        
        
    
    
    



Viewing 6 of 6 rows / 17 columns
1 partition(s)



In [259]:

    
# Group the values of the "product" column with keyCol.fingerprint_cluster;
# with output="json" the result is displayed as a JSON string.
# NOTE(review): the cell importing keyCol appears further down in notebook
# order -- confirm it is also imported earlier so Restart & Run All works.
keyCol.fingerprint_cluster(df, "product", output="json")









    Out[259]:





'{"taaaccoo": {"similar": {"taaaccoo": 1}, "count": 1, "sum": 1}, "piza": {"similar": {"piza": 1}, "count": 1, "sum": 1}, "hamburguer": {"similar": {"hamburguer": 1}, "count": 1, "sum": 1}, "taco": {"similar": {"taco": 3}, "count": 1, "sum": 3}, "pizzza": {"similar": {"pizzza": 1}, "count": 1, "sum": 1}, "arepa": {"similar": {"arepa": 1}, "count": 1, "sum": 1}, "pizza": {"similar": {"pizza": 4}, "count": 1, "sum": 4}, "Rice": {"similar": {"Rice": 1}, "count": 1, "sum": 1}, "110790": {"similar": {"110790": 1}, "count": 1, "sum": 1}, "BEER": {"similar": {"BEER": 1}, "count": 1, "sum": 1}, "Cake": {"similar": {"Cake": 1}, "count": 1, "sum": 1}, "null": {"similar": {"null": 1}, "count": 1, "sum": 1}, "pasta": {"similar": {"pasta": 2}, "count": 1, "sum": 2}}'



In [261]:

    
# Same "product" column, this time grouped with the n-gram fingerprint
# variant using n_size=2; the result is displayed as a JSON string.
keyCol.n_gram_fingerprint_cluster(df, "product", n_size=2, output="json")









    Out[261]:





'{"arepa": {"similar": {"arepa": 1}, "count": 1, "sum": 1}, "taaaccoo": {"similar": {"taaaccoo": 1}, "count": 1, "sum": 1}, "pasta": {"similar": {"pasta": 2}, "count": 1, "sum": 2}, "pizza": {"similar": {"pizzza": 1, "pizza": 4}, "count": 2, "sum": 5}, "110790": {"similar": {"110790": 1}, "count": 1, "sum": 1}, "hamburguer": {"similar": {"hamburguer": 1}, "count": 1, "sum": 1}, "taco": {"similar": {"taco": 3}, "count": 1, "sum": 3}, "Cake": {"similar": {"Cake": 1}, "count": 1, "sum": 1}, "Rice": {"similar": {"Rice": 1}, "count": 1, "sum": 1}, "piza": {"similar": {"piza": 1}, "count": 1, "sum": 1}, "null": {"similar": {"null": 1}, "count": 1, "sum": 1}, "BEER": {"similar": {"BEER": 1}, "count": 1, "sum": 1}}'



In [7]:

    
from optimus.ml import keycollision as keyCol
from optimus.ml import distancecluster as dc



In [258]:

    
# Group the values of the "product" column with dc.levenshtein_cluster;
# with output="json" the result is displayed as a JSON string.
dc.levenshtein_cluster(df, "product", output="json")









    Out[258]:





'{"taaaccoo": {"similar": {"taco": 3, "taaaccoo": 1}, "count": 2, "sum": 4}, "piza": {"similar": {"pizza": 4, "piza": 1}, "count": 2, "sum": 5}, "hamburguer": {"similar": {"BEER": 1, "hamburguer": 1}, "count": 2, "sum": 2}, "taco": {"similar": {"Cake": 1, "Rice": 1, "taco": 3}, "count": 3, "sum": 5}, "pizzza": {"similar": {"pizza": 4, "pizzza": 1}, "count": 2, "sum": 5}, "arepa": {"similar": {"BEER": 1, "piza": 1, "pasta": 2, "Cake": 1, "Rice": 1, "pizza": 4, "arepa": 1}, "count": 7, "sum": 11}, "pizza": {"similar": {"piza": 1, "pizzza": 1, "pizza": 4}, "count": 3, "sum": 6}, "Rice": {"similar": {"piza": 1, "Cake": 1, "taco": 3, "Rice": 1}, "count": 4, "sum": 6}, "110790": {"similar": {"arepa": 1, "BEER": 1, "piza": 1, "pizzza": 1, "pasta": 2, "Cake": 1, "null": 1, "Rice": 1, "pizza": 4, "taco": 3, "110790": 1}, "count": 11, "sum": 17}, "BEER": {"similar": {"arepa": 1, "piza": 1, "Cake": 1, "null": 1, "Rice": 1, "taco": 3, "BEER": 1}, "count": 7, "sum": 9}, "Cake": {"similar": {"Rice": 1, "taco": 3, "Cake": 1}, "count": 3, "sum": 5}, "null": {"similar": {"BEER": 1, "piza": 1, "Cake": 1, "Rice": 1, "taco": 3, "null": 1}, "count": 6, "sum": 8}, "pasta": {"similar": {"piza": 1, "pizza": 4, "pasta": 2}, "count": 3, "sum": 7}}'



In [31]:

    
# n-gram fingerprint grouping of the "names" column with n_size=1;
# the result is displayed as a JSON string.
keyCol.n_gram_fingerprint_cluster(df, "names", output="json", n_size=1)









    









Viewing 6 of 6 rows / 4 columns
1 partition(s)


    
    
        
        
            count
            1 (string)
            
                
                not nullable
                
            
        
        
        
            names
            2 (string)
            
                
                nullable
                
            
        
        
        
            names***NGRAM
            3 (array<string>)
            
                
                not nullable
                
            
        
        
        
            names***NGRAM_FINGERPRINT
            4 (string)
            
                
                nullable
                
            
        
        
    

    
    
    
    
        
        
            
                
                1
                 
            
        
        
        
            
                
                bumbl#ebéé⋅⋅
                 
            
        
        
        
            
                
                ['bumblebee']
                 
            
        
        
        
            
                
                bumblebee
                 
            
        
        
    
    
    
        
        
            
                
                1
                 
            
        
        
        
            
                
                ironhide&
                 
            
        
        
        
            
                
                ['ironhide']
                 
            
        
        
        
            
                
                ironhide
                 
            
        
        
    
    
    
        
        
            
                
                1
                 
            
        
        
        
            
                
                Megatron2
                 
            
        
        
        
            
                
                ['megatron2']
                 
            
        
        
        
            
                
                megatron2
                 
            
        
        
    
    
    
        
        
            
                
                1
                 
            
        
        
        
            
                
                Optimus⋅OptimusPrime
                 
            
        
        
        
            
                
                ['optimusoptimusprime']
                 
            
        
        
        
            
                
                optimusoptimusprime
                 
            
        
        
    
    
    
        
        
            
                
                1
                 
            
        
        
        
            
                
                Megatron1
                 
            
        
        
        
            
                
                ['megatron1']
                 
            
        
        
        
            
                
                megatron1
                 
            
        
        
    
    
    
        
        
            
                
                1
                 
            
        
        
        
            
                
                Megatron
                 
            
        
        
        
            
                
                ['megatron']
                 
            
        
        
        
            
                
                megatron
                 
            
        
        
    
    
    



Viewing 6 of 6 rows / 4 columns
1 partition(s)







    Out[31]:





'{"ironhide&": {"similar": {"ironhide&": 1}, "count": 1, "sum": 1.0}, "Megatron1": {"similar": {"Megatron1": 1}, "count": 1, "sum": 1.0}, "Optimus OptimusPrime": {"similar": {"Optimus OptimusPrime": 1}, "count": 1, "sum": 1.0}, "Megatron": {"similar": {"Megatron": 1}, "count": 1, "sum": 1.0}, "bumbl#eb\\u00e9\\u00e9  ": {"similar": {"bumbl#eb\\u00e9\\u00e9  ": 1}, "count": 1, "sum": 1.0}, "Megatron2": {"similar": {"Megatron2": 1}, "count": 1, "sum": 1.0}}'



In [25]:

    
# Display the current contents of df as a table.
df.table()









    









Viewing 6 of 6 rows / 16 columns
1 partition(s)


    
    
        
        
            names
            1 (string)
            
                
                nullable
                
            
        
        
        
            height(ft)
            2 (smallint)
            
                
                nullable
                
            
        
        
        
            function
            3 (string)
            
                
                nullable
                
            
        
        
        
            rank
            4 (tinyint)
            
                
                nullable
                
            
        
        
        
            age
            5 (int)
            
                
                nullable
                
            
        
        
        
            weight(t)
            6 (float)
            
                
                nullable
                
            
        
        
        
            japanese name
            7 (array<string>)
            
                
                nullable
                
            
        
        
        
            last position seen
            8 (string)
            
                
                nullable
                
            
        
        
        
            date arrival
            9 (string)
            
                
                nullable
                
            
        
        
        
            last date seen
            10 (string)
            
                
                nullable
                
            
        
        
        
            attributes
            11 (array<float>)
            
                
                nullable
                
            
        
        
        
            DateType
            12 (date)
            
                
                nullable
                
            
        
        
        
            timestamp
            13 (timestamp)
            
                
                nullable
                
            
        
        
        
            Cybertronian
            14 (boolean)
            
                
                nullable
                
            
        
        
        
            function(binary)
            15 (binary)
            
                
                nullable
                
            
        
        
        
            NullType
            16 (null)
            
                
                nullable
                
            
        
        
    

    
    
    
    
        
        
            
                
                Optimus⋅OptimusPrime
                 
            
        
        
        
            
                
                28
                 
            
        
        
        
            
                
                Leader
                 
            
        
        
        
            
                
                10
                 
            
        
        
        
            
                
                5000000
                 
            
        
        
        
            
                
                4.300000190734863
                 
            
        
        
        
            
                
                ['Inochi',⋅'Convoy']
                 
            
        
        
        
            
                
                19.442735,-99.201111
                 
            
        
        
        
            
                
                1980/04/10
                 
            
        
        
        
            
                
                2016/09/10
                 
            
        
        
        
            
                
                [8.53439998626709,⋅4300.0]
                 
            
        
        
        
            
                
                2016-09-10
                 
            
        
        
        
            
                
                2014-06-24⋅00:00:00
                 
            
        
        
        
            
                
                True
                 
            
        
        
        
            
                
                bytearray(b'Leader')
                 
            
        
        
        
            
                
                None
                 
            
        
        
    
    
    
        
        
            
                
                bumbl#ebéé⋅⋅
                 
            
        
        
        
            
                
                17
                 
            
        
        
        
            
                
                Espionage
                 
            
        
        
        
            
                
                7
                 
            
        
        
        
            
                
                5000000
                 
            
        
        
        
            
                
                2.0
                 
            
        
        
        
            
                
                ['Bumble',⋅'Goldback']
                 
            
        
        
        
            
                
                10.642707,-71.612534
                 
            
        
        
        
            
                
                1980/04/10
                 
            
        
        
        
            
                
                2015/08/10
                 
            
        
        
        
            
                
                [5.334000110626221,⋅2000.0]
                 
            
        
        
        
            
                
                2015-08-10
                 
            
        
        
        
            
                
                2014-06-24⋅00:00:00
                 
            
        
        
        
            
                
                True
                 
            
        
        
        
            
                
                bytearray(b'Espionage')
                 
            
        
        
        
            
                
                None
                 
            
        
        
    
    
    
        
        
            
                
                ironhide&
                 
            
        
        
        
            
                
                26
                 
            
        
        
        
            
                
                Security
                 
            
        
        
        
            
                
                7
                 
            
        
        
        
            
                
                5000000
                 
            
        
        
        
            
                
                4.0
                 
            
        
        
        
            
                
                ['Roadbuster']
                 
            
        
        
        
            
                
                37.789563,-122.400356
                 
            
        
        
        
            
                
                1980/04/10
                 
            
        
        
        
            
                
                2014/07/10
                 
            
        
        
        
            
                
                [7.924799919128418,⋅4000.0]
                 
            
        
        
        
            
                
                2014-06-24
                 
            
        
        
        
            
                
                2014-06-24⋅00:00:00
                 
            
        
        
        
            
                
                True
                 
            
        
        
        
            
                
                bytearray(b'Security')
                 
            
        
        
        
            
                
                None
                 
            
        
        
    
    
    
        
        
            
                
                Megatron1
                 
            
        
        
        
            
                
                13
                 
            
        
        
        
            
                
                First⋅Lieutenant
                 
            
        
        
        
            
                
                8
                 
            
        
        
        
            
                
                5000000
                 
            
        
        
        
            
                
                1.7999999523162842
                 
            
        
        
        
            
                
                ['Meister']
                 
            
        
        
        
            
                
                33.670666,-117.841553
                 
            
        
        
        
            
                
                1980/04/10
                 
            
        
        
        
            
                
                2013/06/10
                 
            
        
        
        
            
                
                [3.962399959564209,⋅1800.0]
                 
            
        
        
        
            
                
                2013-06-24
                 
            
        
        
        
            
                
                2014-06-24⋅00:00:00
                 
            
        
        
        
            
                
                True
                 
            
        
        
        
            
                
                bytearray(b'First⋅Lieutenant')
                 
            
        
        
        
            
                
                None
                 
            
        
        
    
    
    
        
        
            
                
                Megatron
                 
            
        
        
        
            
                
                None
                 
            
        
        
        
            
                
                None
                 
            
        
        
        
            
                
                10
                 
            
        
        
        
            
                
                5000000
                 
            
        
        
        
            
                
                5.699999809265137
                 
            
        
        
        
            
                
                ['Megatron']
                 
            
        
        
        
            
                
                None
                 
            
        
        
        
            
                
                1980/04/10
                 
            
        
        
        
            
                
                2012/05/10
                 
            
        
        
        
            
                
                [None,⋅5700.0]
                 
            
        
        
        
            
                
                2012-05-10
                 
            
        
        
        
            
                
                2014-06-24⋅00:00:00
                 
            
        
        
        
            
                
                True
                 
            
        
        
        
            
                
                bytearray(b'None')
                 
            
        
        
        
            
                
                None
                 
            
        
        
    
    
    
        
        
            
                
                megatron
                 
            
        
        
        
            
                
                300
                 
            
        
        
        
            
                
                Battle⋅Station
                 
            
        
        
        
            
                
                8
                 
            
        
        
        
            
                
                5000000
                 
            
        
        
        
            
                
                None
                 
            
        
        
        
            
                
                ['Metroflex']
                 
            
        
        
        
            
                
                None
                 
            
        
        
        
            
                
                1980/04/10
                 
            
        
        
        
            
                
                2011/04/10
                 
            
        
        
        
            
                
                [91.44000244140625,⋅None]
                 
            
        
        
        
            
                
                2011-04-10
                 
            
        
        
        
            
                
                2014-06-24⋅00:00:00
                 
            
        
        
        
            
                
                True
                 
            
        
        
        
            
                
                bytearray(b'Battle⋅Station')
                 
            
        
        
        
            
                
                None
                 
            
        
        
    
    
    



Viewing 6 of 6 rows / 16 columns
1 partition(s)



In [81]:

    
# Left commented out on purpose; if re-enabled this would rebind df from data/foo.csv via op.load.csv.
# df = op.load.csv("data/foo.csv", sep=",", header='true', infer_schema='true', charset="UTF-8", null_value="None")



In [82]:

    
# Render df as a table; the output below reports 6 rows / 16 columns in 1 partition.
df.table()









    









Viewing 6 of 6 rows / 16 columns
1 partition(s)


    
    
        
        
            names
            1 (string)
            
                
                nullable
                
            
        
        
        
            height(ft)
            2 (smallint)
            
                
                nullable
                
            
        
        
        
            function
            3 (string)
            
                
                nullable
                
            
        
        
        
            rank
            4 (tinyint)
            
                
                nullable
                
            
        
        
        
            age
            5 (int)
            
                
                nullable
                
            
        
        
        
            weight(t)
            6 (float)
            
                
                nullable
                
            
        
        
        
            japanese name
            7 (array<string>)
            
                
                nullable
                
            
        
        
        
            last position seen
            8 (string)
            
                
                nullable
                
            
        
        
        
            date arrival
            9 (string)
            
                
                nullable
                
            
        
        
        
            last date seen
            10 (string)
            
                
                nullable
                
            
        
        
        
            attributes
            11 (array<float>)
            
                
                nullable
                
            
        
        
        
            DateType
            12 (date)
            
                
                nullable
                
            
        
        
        
            timestamp
            13 (timestamp)
            
                
                nullable
                
            
        
        
        
            Cybertronian
            14 (boolean)
            
                
                nullable
                
            
        
        
        
            function(binary)
            15 (binary)
            
                
                nullable
                
            
        
        
        
            NullType
            16 (null)
            
                
                nullable
                
            
        
        
    

    
    
    
    
        
        
            
                
                Optimus⋅OptimusPrime
                 
            
        
        
        
            
                
                28
                 
            
        
        
        
            
                
                Leader
                 
            
        
        
        
            
                
                10
                 
            
        
        
        
            
                
                5000000
                 
            
        
        
        
            
                
                4.300000190734863
                 
            
        
        
        
            
                
                ['Inochi',⋅'Convoy']
                 
            
        
        
        
            
                
                19.442735,-99.201111
                 
            
        
        
        
            
                
                1980/04/10
                 
            
        
        
        
            
                
                2016/09/10
                 
            
        
        
        
            
                
                [8.53439998626709,⋅4300.0]
                 
            
        
        
        
            
                
                2016-09-10
                 
            
        
        
        
            
                
                2014-06-24⋅00:00:00
                 
            
        
        
        
            
                
                True
                 
            
        
        
        
            
                
                bytearray(b'Leader')
                 
            
        
        
        
            
                
                None
                 
            
        
        
    
    
    
        
        
            
                
                bumbl#ebéé⋅⋅
                 
            
        
        
        
            
                
                17
                 
            
        
        
        
            
                
                Espionage
                 
            
        
        
        
            
                
                7
                 
            
        
        
        
            
                
                5000000
                 
            
        
        
        
            
                
                2.0
                 
            
        
        
        
            
                
                ['Bumble',⋅'Goldback']
                 
            
        
        
        
            
                
                10.642707,-71.612534
                 
            
        
        
        
            
                
                1980/04/10
                 
            
        
        
        
            
                
                2015/08/10
                 
            
        
        
        
            
                
                [5.334000110626221,⋅2000.0]
                 
            
        
        
        
            
                
                2015-08-10
                 
            
        
        
        
            
                
                2014-06-24⋅00:00:00
                 
            
        
        
        
            
                
                True
                 
            
        
        
        
            
                
                bytearray(b'Espionage')
                 
            
        
        
        
            
                
                None
                 
            
        
        
    
    
    
        
        
            
                
                ironhide&
                 
            
        
        
        
            
                
                26
                 
            
        
        
        
            
                
                Security
                 
            
        
        
        
            
                
                7
                 
            
        
        
        
            
                
                5000000
                 
            
        
        
        
            
                
                4.0
                 
            
        
        
        
            
                
                ['Roadbuster']
                 
            
        
        
        
            
                
                37.789563,-122.400356
                 
            
        
        
        
            
                
                1980/04/10
                 
            
        
        
        
            
                
                2014/07/10
                 
            
        
        
        
            
                
                [7.924799919128418,⋅4000.0]
                 
            
        
        
        
            
                
                2014-06-24
                 
            
        
        
        
            
                
                2014-06-24⋅00:00:00
                 
            
        
        
        
            
                
                True
                 
            
        
        
        
            
                
                bytearray(b'Security')
                 
            
        
        
        
            
                
                None
                 
            
        
        
    
    
    
        
        
            
                
                JaJa⋅JaJaJ
                 
            
        
        
        
            
                
                13
                 
            
        
        
        
            
                
                First⋅Lieutenant
                 
            
        
        
        
            
                
                8
                 
            
        
        
        
            
                
                5000000
                 
            
        
        
        
            
                
                1.7999999523162842
                 
            
        
        
        
            
                
                ['Meister']
                 
            
        
        
        
            
                
                33.670666,-117.841553
                 
            
        
        
        
            
                
                1980/04/10
                 
            
        
        
        
            
                
                2013/06/10
                 
            
        
        
        
            
                
                [3.962399959564209,⋅1800.0]
                 
            
        
        
        
            
                
                2013-06-24
                 
            
        
        
        
            
                
                2014-06-24⋅00:00:00
                 
            
        
        
        
            
                
                True
                 
            
        
        
        
            
                
                bytearray(b'First⋅Lieutenant')
                 
            
        
        
        
            
                
                None
                 
            
        
        
    
    
    
        
        
            
                
                Megatron
                 
            
        
        
        
            
                
                None
                 
            
        
        
        
            
                
                None
                 
            
        
        
        
            
                
                10
                 
            
        
        
        
            
                
                5000000
                 
            
        
        
        
            
                
                5.699999809265137
                 
            
        
        
        
            
                
                ['Megatron']
                 
            
        
        
        
            
                
                None
                 
            
        
        
        
            
                
                1980/04/10
                 
            
        
        
        
            
                
                2012/05/10
                 
            
        
        
        
            
                
                [None,⋅5700.0]
                 
            
        
        
        
            
                
                2012-05-10
                 
            
        
        
        
            
                
                2014-06-24⋅00:00:00
                 
            
        
        
        
            
                
                True
                 
            
        
        
        
            
                
                bytearray(b'None')
                 
            
        
        
        
            
                
                None
                 
            
        
        
    
    
    
        
        
            
                
                Metroplex_)^$
                 
            
        
        
        
            
                
                300
                 
            
        
        
        
            
                
                Battle⋅Station
                 
            
        
        
        
            
                
                8
                 
            
        
        
        
            
                
                5000000
                 
            
        
        
        
            
                
                None
                 
            
        
        
        
            
                
                ['Metroflex']
                 
            
        
        
        
            
                
                None
                 
            
        
        
        
            
                
                1980/04/10
                 
            
        
        
        
            
                
                2011/04/10
                 
            
        
        
        
            
                
                [91.44000244140625,⋅None]
                 
            
        
        
        
            
                
                2011-04-10
                 
            
        
        
        
            
                
                2014-06-24⋅00:00:00
                 
            
        
        
        
            
                
                True
                 
            
        
        
        
            
                
                bytearray(b'Battle⋅Station')
                 
            
        
        
        
            
                
                None
                 
            
        
        
    
    
    



Viewing 6 of 6 rows / 16 columns
1 partition(s)



In [95]:

    
# Replace the words "JaJa" / "bbb" in the "names" column with "aaa" and show the result.
# With search_by="words" only the standalone word is swapped: the output below shows
# "JaJa JaJaJ" becoming "aaa JaJaJ", with the longer token "JaJaJ" left untouched.
(
    df.cols
    .replace("names", ["JaJa", "bbb"], "aaa", search_by="words")
    .table()
)









    









Viewing 6 of 6 rows / 16 columns
1 partition(s)


    
    
        
        
            names
            1 (string)
            
                
                nullable
                
            
        
        
        
            height(ft)
            2 (smallint)
            
                
                nullable
                
            
        
        
        
            function
            3 (string)
            
                
                nullable
                
            
        
        
        
            rank
            4 (tinyint)
            
                
                nullable
                
            
        
        
        
            age
            5 (int)
            
                
                nullable
                
            
        
        
        
            weight(t)
            6 (float)
            
                
                nullable
                
            
        
        
        
            japanese name
            7 (array<string>)
            
                
                nullable
                
            
        
        
        
            last position seen
            8 (string)
            
                
                nullable
                
            
        
        
        
            date arrival
            9 (string)
            
                
                nullable
                
            
        
        
        
            last date seen
            10 (string)
            
                
                nullable
                
            
        
        
        
            attributes
            11 (array<float>)
            
                
                nullable
                
            
        
        
        
            DateType
            12 (date)
            
                
                nullable
                
            
        
        
        
            timestamp
            13 (timestamp)
            
                
                nullable
                
            
        
        
        
            Cybertronian
            14 (boolean)
            
                
                nullable
                
            
        
        
        
            function(binary)
            15 (binary)
            
                
                nullable
                
            
        
        
        
            NullType
            16 (null)
            
                
                nullable
                
            
        
        
    

    
    
    
    
        
        
            
                
                Optimus⋅OptimusPrime
                 
            
        
        
        
            
                
                28
                 
            
        
        
        
            
                
                Leader
                 
            
        
        
        
            
                
                10
                 
            
        
        
        
            
                
                5000000
                 
            
        
        
        
            
                
                4.300000190734863
                 
            
        
        
        
            
                
                ['Inochi',⋅'Convoy']
                 
            
        
        
        
            
                
                19.442735,-99.201111
                 
            
        
        
        
            
                
                1980/04/10
                 
            
        
        
        
            
                
                2016/09/10
                 
            
        
        
        
            
                
                [8.53439998626709,⋅4300.0]
                 
            
        
        
        
            
                
                2016-09-10
                 
            
        
        
        
            
                
                2014-06-24⋅00:00:00
                 
            
        
        
        
            
                
                True
                 
            
        
        
        
            
                
                bytearray(b'Leader')
                 
            
        
        
        
            
                
                None
                 
            
        
        
    
    
    
        
        
            
                
                bumbl#ebéé⋅⋅
                 
            
        
        
        
            
                
                17
                 
            
        
        
        
            
                
                Espionage
                 
            
        
        
        
            
                
                7
                 
            
        
        
        
            
                
                5000000
                 
            
        
        
        
            
                
                2.0
                 
            
        
        
        
            
                
                ['Bumble',⋅'Goldback']
                 
            
        
        
        
            
                
                10.642707,-71.612534
                 
            
        
        
        
            
                
                1980/04/10
                 
            
        
        
        
            
                
                2015/08/10
                 
            
        
        
        
            
                
                [5.334000110626221,⋅2000.0]
                 
            
        
        
        
            
                
                2015-08-10
                 
            
        
        
        
            
                
                2014-06-24⋅00:00:00
                 
            
        
        
        
            
                
                True
                 
            
        
        
        
            
                
                bytearray(b'Espionage')
                 
            
        
        
        
            
                
                None
                 
            
        
        
    
    
    
        
        
            
                
                ironhide&
                 
            
        
        
        
            
                
                26
                 
            
        
        
        
            
                
                Security
                 
            
        
        
        
            
                
                7
                 
            
        
        
        
            
                
                5000000
                 
            
        
        
        
            
                
                4.0
                 
            
        
        
        
            
                
                ['Roadbuster']
                 
            
        
        
        
            
                
                37.789563,-122.400356
                 
            
        
        
        
            
                
                1980/04/10
                 
            
        
        
        
            
                
                2014/07/10
                 
            
        
        
        
            
                
                [7.924799919128418,⋅4000.0]
                 
            
        
        
        
            
                
                2014-06-24
                 
            
        
        
        
            
                
                2014-06-24⋅00:00:00
                 
            
        
        
        
            
                
                True
                 
            
        
        
        
            
                
                bytearray(b'Security')
                 
            
        
        
        
            
                
                None
                 
            
        
        
    
    
    
        
        
            
                
                aaa⋅JaJaJ
                 
            
        
        
        
            
                
                13
                 
            
        
        
        
            
                
                First⋅Lieutenant
                 
            
        
        
        
            
                
                8
                 
            
        
        
        
            
                
                5000000
                 
            
        
        
        
            
                
                1.7999999523162842
                 
            
        
        
        
            
                
                ['Meister']
                 
            
        
        
        
            
                
                33.670666,-117.841553
                 
            
        
        
        
            
                
                1980/04/10
                 
            
        
        
        
            
                
                2013/06/10
                 
            
        
        
        
            
                
                [3.962399959564209,⋅1800.0]
                 
            
        
        
        
            
                
                2013-06-24
                 
            
        
        
        
            
                
                2014-06-24⋅00:00:00
                 
            
        
        
        
            
                
                True
                 
            
        
        
        
            
                
                bytearray(b'First⋅Lieutenant')
                 
            
        
        
        
            
                
                None
                 
            
        
        
    
    
    
        
        
            
                
                Megatron
                 
            
        
        
        
            
                
                None
                 
            
        
        
        
            
                
                None
                 
            
        
        
        
            
                
                10
                 
            
        
        
        
            
                
                5000000
                 
            
        
        
        
            
                
                5.699999809265137
                 
            
        
        
        
            
                
                ['Megatron']
                 
            
        
        
        
            
                
                None
                 
            
        
        
        
            
                
                1980/04/10
                 
            
        
        
        
            
                
                2012/05/10
                 
            
        
        
        
            
                
                [None,⋅5700.0]
                 
            
        
        
        
            
                
                2012-05-10
                 
            
        
        
        
            
                
                2014-06-24⋅00:00:00
                 
            
        
        
        
            
                
                True
                 
            
        
        
        
            
                
                bytearray(b'None')
                 
            
        
        
        
            
                
                None
                 
            
        
        
    
    
    
        
        
            
                
                Metroplex_)^$
                 
            
        
        
        
            
                
                300
                 
            
        
        
        
            
                
                Battle⋅Station
                 
            
        
        
        
            
                
                8
                 
            
        
        
        
            
                
                5000000
                 
            
        
        
        
            
                
                None
                 
            
        
        
        
            
                
                ['Metroflex']
                 
            
        
        
        
            
                
                None
                 
            
        
        
        
            
                
                1980/04/10
                 
            
        
        
        
            
                
                2011/04/10
                 
            
        
        
        
            
                
                [91.44000244140625,⋅None]
                 
            
        
        
        
            
                
                2011-04-10
                 
            
        
        
        
            
                
                2014-06-24⋅00:00:00
                 
            
        
        
        
            
                
                True
                 
            
        
        
        
            
                
                bytearray(b'Battle⋅Station')
                 
            
        
        
        
            
                
                None
                 
            
        
        
    
    
    



Viewing 6 of 6 rows / 16 columns
1 partition(s)



In [20]:

    
# NOTE(review): presumably pushes df to the Bumblebee app enabled by Optimus(comm=True) — confirm.
# The only effect visible in this notebook is the "Send!" message printed below.
df.send()









    



Send!



In [7]:

    
# NOTE(review): the argument looks like a row limit (the output shows all 19 rows) — confirm.
# The output below lists 19 rows / 8 columns (id, firstName, ...), not the 6-row / 16-column
# df rendered by the cells above, and this cell is numbered In [7]; it apparently ran
# out of order against a different df. Re-run the notebook top to bottom before relying on it.
df.table(20)









    Out[7]:









Viewing 19 of 19 rows / 8 columns
1 partition(s)


    
    
        
        
            id
            1 (int)
            
                
                nullable
                
            
        
        
        
            firstName
            2 (string)
            
                
                nullable
                
            
        
        
        
            lastName
            3 (string)
            
                
                nullable
                
            
        
        
        
            billingId
            4 (int)
            
                
                nullable
                
            
        
        
        
            product
            5 (string)
            
                
                nullable
                
            
        
        
        
            price
            6 (int)
            
                
                nullable
                
            
        
        
        
            birth
            7 (string)
            
                
                nullable
                
            
        
        
        
            dummyCol
            8 (string)
            
                
                nullable
                
            
        
        
    

    
    
    
    
        
        
            
                
                1
                 
            
        
        
        
            
                
                Luis
                 
            
        
        
        
            
                
                Alvarez$$%!
                 
            
        
        
        
            
                
                123
                 
            
        
        
        
            
                
                Cake
                 
            
        
        
        
            
                
                10
                 
            
        
        
        
            
                
                1980/07/07
                 
            
        
        
        
            
                
                never
                 
            
        
        
    
    
    
        
        
            
                
                2
                 
            
        
        
        
            
                
                André
                 
            
        
        
        
            
                
                Ampère
                 
            
        
        
        
            
                
                423
                 
            
        
        
        
            
                
                piza
                 
            
        
        
        
            
                
                8
                 
            
        
        
        
            
                
                1950/07/08
                 
            
        
        
        
            
                
                gonna
                 
            
        
        
    
    
    
        
        
            
                
                3
                 
            
        
        
        
            
                
                NiELS
                 
            
        
        
        
            
                
                Böhr//((%%
                 
            
        
        
        
            
                
                551
                 
            
        
        
        
            
                
                pizza
                 
            
        
        
        
            
                
                8
                 
            
        
        
        
            
                
                1990/07/09
                 
            
        
        
        
            
                
                give
                 
            
        
        
    
    
    
        
        
            
                
                4
                 
            
        
        
        
            
                
                PAUL
                 
            
        
        
        
            
                
                dirac$
                 
            
        
        
        
            
                
                521
                 
            
        
        
        
            
                
                pizza
                 
            
        
        
        
            
                
                8
                 
            
        
        
        
            
                
                1954/07/10
                 
            
        
        
        
            
                
                you
                 
            
        
        
    
    
    
        
        
            
                
                5
                 
            
        
        
        
            
                
                Albert
                 
            
        
        
        
            
                
                Einstein
                 
            
        
        
        
            
                
                634
                 
            
        
        
        
            
                
                pizza
                 
            
        
        
        
            
                
                8
                 
            
        
        
        
            
                
                1990/07/11
                 
            
        
        
        
            
                
                up
                 
            
        
        
    
    
    
        
        
            
                
                6
                 
            
        
        
        
            
                
                Galileo
                 
            
        
        
        
            
                
                ⋅⋅⋅⋅⋅⋅⋅⋅⋅⋅⋅⋅⋅GALiLEI
                 
            
        
        
        
            
                
                672
                 
            
        
        
        
            
                
                arepa
                 
            
        
        
        
            
                
                5
                 
            
        
        
        
            
                
                1930/08/12
                 
            
        
        
        
            
                
                never
                 
            
        
        
    
    
    
        
        
            
                
                7
                 
            
        
        
        
            
                
                CaRL
                 
            
        
        
        
            
                
                Ga%%%uss
                 
            
        
        
        
            
                
                323
                 
            
        
        
        
            
                
                taco
                 
            
        
        
        
            
                
                3
                 
            
        
        
        
            
                
                1970/07/13
                 
            
        
        
        
            
                
                gonna
                 
            
        
        
    
    
    
        
        
            
                
                8
                 
            
        
        
        
            
                
                David
                 
            
        
        
        
            
                
                H$$$ilbert
                 
            
        
        
        
            
                
                624
                 
            
        
        
        
            
                
                taaaccoo
                 
            
        
        
        
            
                
                3
                 
            
        
        
        
            
                
                1950/07/14
                 
            
        
        
        
            
                
                let
                 
            
        
        
    
    
    
        
        
            
                
                9
                 
            
        
        
        
            
                
                Johannes
                 
            
        
        
        
            
                
                KEPLER
                 
            
        
        
        
            
                
                735
                 
            
        
        
        
            
                
                taco
                 
            
        
        
        
            
                
                3
                 
            
        
        
        
            
                
                1920/04/22
                 
            
        
        
        
            
                
                you
                 
            
        
        
    
    
    
        
        
            
                
                10
                 
            
        
        
        
            
                
                JaMES
                 
            
        
        
        
            
                
                M$$ax%%well
                 
            
        
        
        
            
                
                875
                 
            
        
        
        
            
                
                taco
                 
            
        
        
        
            
                
                3
                 
            
        
        
        
            
                
                1923/03/12
                 
            
        
        
        
            
                
                down
                 
            
        
        
    
    
    
        
        
            
                
                11
                 
            
        
        
        
            
                
                Isaac
                 
            
        
        
        
            
                
                Newton
                 
            
        
        
        
            
                
                992
                 
            
        
        
        
            
                
                pasta
                 
            
        
        
        
            
                
                9
                 
            
        
        
        
            
                
                1999/02/15
                 
            
        
        
        
            
                
                never⋅
                 
            
        
        
    
    
    
        
        
            
                
                12
                 
            
        
        
        
            
                
                Emmy%%
                 
            
        
        
        
            
                
                Nöether$
                 
            
        
        
        
            
                
                234
                 
            
        
        
        
            
                
                pasta
                 
            
        
        
        
            
                
                9
                 
            
        
        
        
            
                
                1993/12/08
                 
            
        
        
        
            
                
                gonna
                 
            
        
        
    
    
    
        
        
            
                
                13
                 
            
        
        
        
            
                
                Max!!!
                 
            
        
        
        
            
                
                Planck!!!
                 
            
        
        
        
            
                
                111
                 
            
        
        
        
            
                
                hamburguer
                 
            
        
        
        
            
                
                4
                 
            
        
        
        
            
                
                1994/01/04
                 
            
        
        
        
            
                
                run⋅
                 
            
        
        
    
    
    
        
        
            
                
                14
                 
            
        
        
        
            
                
                Fred
                 
            
        
        
        
            
                
                Hoy&&≤
                 
            
        
        
        
            
                
                553
                 
            
        
        
        
            
                
                pizzza
                 
            
        
        
        
            
                
                8
                 
            
        
        
        
            
                
                1997/06/27
                 
            
        
        
        
            
                
                around
                 
            
        
        
    
    
    
        
        
            
                
                15
                 
            
        
        
        
            
                
                (((⋅⋅⋅Heinrich⋅)))))
                 
            
        
        
        
            
                
                Hertz
                 
            
        
        
        
            
                
                116
                 
            
        
        
        
            
                
                pizza
                 
            
        
        
        
            
                
                8
                 
            
        
        
        
            
                
                1956/11/30
                 
            
        
        
        
            
                
                and
                 
            
        
        
    
    
    
        
        
            
                
                16
                 
            
        
        
        
            
                
                William
                 
            
        
        
        
            
                
                Gilbert###
                 
            
        
        
        
            
                
                886
                 
            
        
        
        
            
                
                BEER
                 
            
        
        
        
            
                
                2
                 
            
        
        
        
            
                
                1958/03/26
                 
            
        
        
        
            
                
                desert
                 
            
        
        
    
    
    
        
        
            
                
                17
                 
            
        
        
        
            
                
                Marie
                 
            
        
        
        
            
                
                CURIE
                 
            
        
        
        
            
                
                912
                 
            
        
        
        
            
                
                Rice
                 
            
        
        
        
            
                
                1
                 
            
        
        
        
            
                
                2000/03/22
                 
            
        
        
        
            
                
                you
                 
            
        
        
    
    
    
        
        
            
                
                18
                 
            
        
        
        
            
                
                Arthur
                 
            
        
        
        
            
                
                COM%%%pton
                 
            
        
        
        
            
                
                812
                 
            
        
        
        
            
                
                110790
                 
            
        
        
        
            
                
                5
                 
            
        
        
        
            
                
                1899/01/01
                 
            
        
        
        
            
                
                #
                 
            
        
        
    
    
    
        
        
            
                
                19
                 
            
        
        
        
            
                
                JAMES
                 
            
        
        
        
            
                
                Chadwick
                 
            
        
        
        
            
                
                467
                 
            
        
        
        
            
                
                null
                 
            
        
        
        
            
                
                10
                 
            
        
        
        
            
                
                1921/05/03
                 
            
        
        
        
            
                
                #
                 
            
        
        
    
    
    



Viewing 19 of 19 rows / 8 columns
1 partition(s)



In [10]:

    
# Run the z_score outlier helper on the "price" column with threshold=1 and
# show its .info() summary (a dict of outlier / non-outlier counts and the
# maximum z-score, as recorded in the output below).
df.outliers.z_score("price",threshold =1).info()









    Out[10]:





{'count_outliers': 8, 'count_non_outliers': 11, 'max_z_score': 1.7111}



In [8]:

    
# Run the tukey outlier helper on the "price" column and show its .info()
# summary dict.
# NOTE(review): the recorded bounds (-4.5 / 15.5) are consistent with
# iqr1 - 1.5 * (iqr3 - iqr1) and iqr3 + 1.5 * (iqr3 - iqr1) for iqr1=3, iqr3=8,
# so the "iqr1"/"iqr3" keys presumably hold the first and third quartiles —
# confirm against the library.
df.outliers.tukey("price").info()









    Out[8]:





{'count_outliers': 0,
 'count_non_outliers': 19,
 'lower_bound': -4.5,
 'lower_bound_count': 0,
 'upper_bound': 15.5,
 'upper_bound_count': 0,
 'iqr1': 3,
 'iqr3': 8}



In [9]:

    
# Run the mad outlier helper on the "price" column with threshold=1 and show
# its .info() summary dict.
# NOTE(review): the recorded output reports count_outliers=9 together with
# count_non_outliers=19, but the frame only has 19 rows in total. The
# non-outlier count does not appear to exclude the outliers — verify upstream.
df.outliers.mad("price", threshold =1).info()









    Out[9]:





{'count_outliers': 9,
 'count_non_outliers': 19,
 'lower_bound': 6,
 'lower_bound_count': 9,
 'upper_bound': 10,
 'upper_bound_count': 0}



In [11]:

    
# Run the modified_z_score outlier helper on the "price" column with
# threshold=1 and show its .info() summary dict.
# NOTE(review): the recorded output reports count_outliers=19 and
# count_non_outliers=19 for a 19-row frame; both cannot be right at once —
# verify the counting logic upstream.
df.outliers.modified_z_score("price",threshold =1).info()









    Out[11]:





{'count_outliers': 19, 'count_non_outliers': 19, 'max_m_z_score': 2.36075}



In [47]:

    
%%time
# Run levenshtein_cluster on the "product" column and print its JSON output
# (one entry per product value with "similar", "count" and "sum" keys, as
# recorded below).
# NOTE(review): this import lives inside a timed cell rather than in a
# top-of-notebook import cell, so the measured wall time may include it.
from optimus.ml import distancecluster as dc
print(dc.levenshtein_cluster(df,'product',output="json"))









    



{"taaaccoo": {"similar": {"taco": 3, "taaaccoo": 1}, "count": 2, "sum": 4}, "piza": {"similar": {"pizza": 4, "piza": 1}, "count": 2, "sum": 5}, "hamburguer": {"similar": {"BEER": 1, "hamburguer": 1}, "count": 2, "sum": 2}, "taco": {"similar": {"Cake": 1, "Rice": 1, "taco": 3}, "count": 3, "sum": 5}, "pizzza": {"similar": {"pizza": 4, "pizzza": 1}, "count": 2, "sum": 5}, "arepa": {"similar": {"BEER": 1, "piza": 1, "pasta": 2, "Cake": 1, "Rice": 1, "pizza": 4, "arepa": 1}, "count": 7, "sum": 11}, "pizza": {"similar": {"piza": 1, "pizzza": 1, "pizza": 4}, "count": 3, "sum": 6}, "Rice": {"similar": {"piza": 1, "Cake": 1, "taco": 3, "Rice": 1}, "count": 4, "sum": 6}, "110790": {"similar": {"arepa": 1, "BEER": 1, "piza": 1, "pizzza": 1, "pasta": 2, "Cake": 1, "null": 1, "Rice": 1, "pizza": 4, "taco": 3, "110790": 1}, "count": 11, "sum": 17}, "BEER": {"similar": {"arepa": 1, "piza": 1, "Cake": 1, "null": 1, "Rice": 1, "taco": 3, "BEER": 1}, "count": 7, "sum": 9}, "Cake": {"similar": {"Rice": 1, "taco": 3, "Cake": 1}, "count": 3, "sum": 5}, "null": {"similar": {"BEER": 1, "piza": 1, "Cake": 1, "Rice": 1, "taco": 3, "null": 1}, "count": 6, "sum": 8}, "pasta": {"similar": {"piza": 1, "pizza": 4, "pasta": 2}, "count": 3, "sum": 7}}
Wall time: 9.6 s



In [51]:

    
# NOTE(review): within this cell `dc` is referenced only by the commented-out
# call further down; `kc` is what the live code uses.
from optimus.ml import distancecluster as dc
from optimus.ml import keycollision as kc

# Disabled alternative kept from an earlier experiment:
# result = dc.levenshtein_json(df,'product')
# Run fingerprint_cluster on the "product" column and keep the return value in
# `result`. The meaning of the third positional argument (3) is not visible
# here — TODO confirm against the keycollision API.
result = kc.fingerprint_cluster(df, "product",3)



In [62]:

    
# Rebinds `result` (previously assigned from kc.fingerprint_cluster) to the
# return value of n_gram_fingerprint_cluster on the "product" column.
# The call itself appears to render the tables shown below, since an
# assignment produces no output of its own.
# NOTE(review): with the third argument set to 3, the recorded
# product***NGRAM column holds a single-element array containing the whole
# lower-cased value (e.g. ['taaaccoo']) rather than several n-grams — confirm
# this is the intended behaviour.
result = kc.n_gram_fingerprint_cluster(df, "product",3)









    









Viewing 10 of 13 rows / 4 columns
1 partition(s)


    
    
        
        
            count
            1 (string)
            
                
                not nullable
                
            
        
        
        
            product
            2 (string)
            
                
                nullable
                
            
        
        
        
            product***NGRAM
            3 (array<string>)
            
                
                not nullable
                
            
        
        
        
            product***NGRAM_FINGERPRINT
            4 (string)
            
                
                nullable
                
            
        
        
    

    
    
    
    
        
        
            
                
                1
                 
            
        
        
        
            
                
                taaaccoo
                 
            
        
        
        
            
                
                ['taaaccoo']
                 
            
        
        
        
            
                
                taaaccoo
                 
            
        
        
    
    
    
        
        
            
                
                1
                 
            
        
        
        
            
                
                piza
                 
            
        
        
        
            
                
                ['piza']
                 
            
        
        
        
            
                
                piza
                 
            
        
        
    
    
    
        
        
            
                
                1
                 
            
        
        
        
            
                
                hamburguer
                 
            
        
        
        
            
                
                ['hamburguer']
                 
            
        
        
        
            
                
                hamburguer
                 
            
        
        
    
    
    
        
        
            
                
                3
                 
            
        
        
        
            
                
                taco
                 
            
        
        
        
            
                
                ['taco']
                 
            
        
        
        
            
                
                taco
                 
            
        
        
    
    
    
        
        
            
                
                1
                 
            
        
        
        
            
                
                BEER
                 
            
        
        
        
            
                
                ['beer']
                 
            
        
        
        
            
                
                beer
                 
            
        
        
    
    
    
        
        
            
                
                1
                 
            
        
        
        
            
                
                pizzza
                 
            
        
        
        
            
                
                ['pizzza']
                 
            
        
        
        
            
                
                pizzza
                 
            
        
        
    
    
    
        
        
            
                
                1
                 
            
        
        
        
            
                
                arepa
                 
            
        
        
        
            
                
                ['arepa']
                 
            
        
        
        
            
                
                arepa
                 
            
        
        
    
    
    
        
        
            
                
                4
                 
            
        
        
        
            
                
                pizza
                 
            
        
        
        
            
                
                ['pizza']
                 
            
        
        
        
            
                
                pizza
                 
            
        
        
    
    
    
        
        
            
                
                1
                 
            
        
        
        
            
                
                Rice
                 
            
        
        
        
            
                
                ['rice']
                 
            
        
        
        
            
                
                rice
                 
            
        
        
    
    
    
        
        
            
                
                1
                 
            
        
        
        
            
                
                110790
                 
            
        
        
        
            
                
                ['110790']
                 
            
        
        
        
            
                
                110790
                 
            
        
        
    
    
    



Viewing 10 of 13 rows / 4 columns
1 partition(s)



In [63]:

    
# Print the clustering summary held in `result`: per the recorded output, one
# entry per product value with its 'similar' list, 'count' and 'sum'.
print(result)









    



{'taaaccoo': {'similar': ['taaaccoo'], 'count': 1, 'sum': 1.0}, 'piza': {'similar': ['piza'], 'count': 1, 'sum': 1.0}, 'hamburguer': {'similar': ['hamburguer'], 'count': 1, 'sum': 1.0}, 'taco': {'similar': ['taco'], 'count': 1, 'sum': 3.0}, 'pizzza': {'similar': ['pizzza'], 'count': 1, 'sum': 1.0}, 'arepa': {'similar': ['arepa'], 'count': 1, 'sum': 1.0}, 'pizza': {'similar': ['pizza'], 'count': 1, 'sum': 4.0}, 'Rice': {'similar': ['Rice'], 'count': 1, 'sum': 1.0}, '110790': {'similar': ['110790'], 'count': 1, 'sum': 1.0}, 'BEER': {'similar': ['BEER'], 'count': 1, 'sum': 1.0}, 'Cake': {'similar': ['Cake'], 'count': 1, 'sum': 1.0}, 'null': {'similar': ['null'], 'count': 1, 'sum': 1.0}, 'pasta': {'similar': ['pasta'], 'count': 1, 'sum': 2.0}}



In [159]:

    
# Inspect the runtime type of `result`.
# NOTE(review): this was recorded as `str` at execution count 159, while the
# In [68] cell calls result.collect(), which a str does not provide. `result`
# was evidently rebound between those runs, so the notebook will not reproduce
# these outputs when executed top to bottom.
type(result)









    Out[159]:





str



In [68]:

    
# Build a mapping from each collected row's first value to its second value,
# echoing every row (as a list of its values) while doing so.
# Expects `result` to expose .collect() yielding rows that provide .asDict().
kv_dict = dict()
for row in result.collect():
    # Values are taken in the order asDict() yields them.
    _row = [*row.asDict().values()]
    print(_row)
    kv_dict.update({_row[0]: _row[1]})









    



['taaaccoo', 1]
['piza', 1]
['hamburguer', 1]
['taco', 3]
['BEER', 1]
['pizzza', 1]
['arepa', 1]
['pizza', 4]
['Rice', 1]
['110790', 1]
['Cake', 1]
['null', 1]
['pasta', 2]



In [69]:

    
# Print the mapping built in the preceding loop; per the recorded output it
# maps each product value to an integer (e.g. 'pizza': 4).
print(kv_dict)









    



{'taaaccoo': 1, 'piza': 1, 'hamburguer': 1, 'taco': 3, 'BEER': 1, 'pizzza': 1, 'arepa': 1, 'pizza': 4, 'Rice': 1, '110790': 1, 'Cake': 1, 'null': 1, 'pasta': 2}



In [46]:

    
# Calls cols.replace on the "product***LEVENSHTEIN_DISTANCE" column with the
# arguments 0 and None, then renders the result with .table().
# NOTE(review): `a` is not assigned anywhere in the visible cells, and the
# recorded traceback shows it was a `str` when this ran
# ("'str' object has no attribute 'cols'"). Bind `a` to the intended
# DataFrame before this cell, or remove the cell.
a.cols.replace("product***LEVENSHTEIN_DISTANCE", 0, None).table()









    



---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-46-d4821b29c8c9> in <module>
----> 1 a.cols.replace("product***LEVENSHTEIN_DISTANCE", 0, None).table()

AttributeError: 'str' object has no attribute 'cols'



In [47]:

    
# Drops the rows where the two LEVENSHTEIN columns differ AND the distance
# column equals 0, then renders the result with .table().
# NOTE(review): the recorded traceback shows `a` was a `str` when this ran
# ("'str' object has no attribute 'rows'"); it must be bound to a DataFrame
# first. Also note the mixed column naming ("product_LEVENSHTEIN_1" with an
# underscore vs. "product***LEVENSHTEIN_DISTANCE" with "***") — confirm these
# match the columns actually produced.
a.rows.drop(where=((a["product_LEVENSHTEIN_1"]!=a["product_LEVENSHTEIN_2"])& (a["product***LEVENSHTEIN_DISTANCE"]==0))).table()









    



---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-47-fb2466275319> in <module>
----> 1 a.rows.drop(where=((a["product_LEVENSHTEIN_1"]!=a["product_LEVENSHTEIN_2"])& (a["product***LEVENSHTEIN_DISTANCE"]==0))).table()

AttributeError: 'str' object has no attribute 'rows'



In [12]:



In [ ]:



In [ ]:

id 1 (int) nullable	firstName 2 (string) nullable	lastName 3 (string) nullable	billingId 4 (int) nullable	product 5 (string) nullable	price 6 (int) nullable	birth 7 (string) nullable	dummyCol 8 (string) nullable
1	Luis	Alvarez$$%!	123	Cake	10	1980/07/07	never
2	André	Ampère	423	piza	8	1950/07/08	gonna
3	NiELS	Böhr//((%%	551	pizza	8	1990/07/09	give
4	PAUL	dirac$	521	pizza	8	1954/07/10	you
5	Albert	Einstein	634	pizza	8	1990/07/11	up
6	Galileo	⋅⋅⋅⋅⋅⋅⋅⋅⋅⋅⋅⋅⋅GALiLEI	672	arepa	5	1930/08/12	never
7	CaRL	Ga%%%uss	323	taco	3	1970/07/13	gonna
8	David	H$$$ilbert	624	taaaccoo	3	1950/07/14	let
9	Johannes	KEPLER	735	taco	3	1920/04/22	you
10	JaMES	M$$ax%%well	875	taco	3	1923/03/12	down
11	Isaac	Newton	992	pasta	9	1999/02/15	never⋅
12	Emmy%%	Nöether$	234	pasta	9	1993/12/08	gonna
13	Max!!!	Planck!!!	111	hamburguer	4	1994/01/04	run⋅
14	Fred	Hoy&&≤	553	pizzza	8	1997/06/27	around
15	(((⋅⋅⋅Heinrich⋅)))))	Hertz	116	pizza	8	1956/11/30	and
16	William	Gilbert###	886	BEER	2	1958/03/26	desert
17	Marie	CURIE	912	Rice	1	2000/03/22	you
18	Arthur	COM%%%pton	812	110790	5	1899/01/01	#
19	JAMES	Chadwick	467	null	10	1921/05/03	#

names 1 (string) nullable	height(ft) 2 (smallint) nullable	function 3 (string) nullable	rank 4 (tinyint) nullable	age 5 (int) nullable	weight(t) 6 (float) nullable	japanese name 7 (array<string>) nullable	last position seen 8 (string) nullable	date arrival 9 (string) nullable	last date seen 10 (string) nullable	attributes 11 (array<float>) nullable	DateType 12 (date) nullable	timestamp 13 (timestamp) nullable	Cybertronian 14 (boolean) nullable	function(binary) 15 (binary) nullable	NullType 16 (null) nullable
bumbl#ebéé⋅⋅	17	Espionage	7	5000000	2.0	['Bumble',⋅'Goldback']	10.642707,-71.612534	1980/04/10	2015/08/10	[5.334000110626221,⋅2000.0]	2015-08-10	2014-06-24⋅00:00:00	True	bytearray(b'Espionage')	None
ironhide&	26	Security	7	5000000	4.0	['Roadbuster']	37.789563,-122.400356	1980/04/10	2014/07/10	[7.924799919128418,⋅4000.0]	2014-06-24	2014-06-24⋅00:00:00	True	bytearray(b'Security')	None
1⋅Megatron	13	First⋅Lieutenant	8	5000000	1.7999999523162842	['Meister']	33.670666,-117.841553	1980/04/10	2013/06/10	[3.962399959564209,⋅1800.0]	2013-06-24	2014-06-24⋅00:00:00	True	bytearray(b'First⋅Lieutenant')	None

names 1 (string) nullable	height(ft) 2 (smallint) nullable	function 3 (string) nullable	rank 4 (tinyint) nullable	age 5 (int) nullable	weight(t) 6 (float) nullable	japanese name 7 (array<string>) nullable	last position seen 8 (string) nullable	date arrival 9 (string) nullable	last date seen 10 (string) nullable	attributes 11 (array<float>) nullable	DateType 12 (date) nullable	timestamp 13 (timestamp) nullable	Cybertronian 14 (boolean) nullable	function(binary) 15 (binary) nullable	NullType 16 (null) nullable
Optimus⋅OptimusPrime	28	redaeL	10	5000000	4.300000190734863	['Inochi',⋅'Convoy']	19.442735,-99.201111	1980/04/10	2016/09/10	[8.53439998626709,⋅4300.0]	2016-09-10	2014-06-24⋅00:00:00	True	bytearray(b'Leader')	None
bumbl#ebéé⋅⋅	17	eganoipsE	7	5000000	2.0	['Bumble',⋅'Goldback']	10.642707,-71.612534	1980/04/10	2015/08/10	[5.334000110626221,⋅2000.0]	2015-08-10	2014-06-24⋅00:00:00	True	bytearray(b'Espionage')	None
ironhide&	26	ytiruceS	7	5000000	4.0	['Roadbuster']	37.789563,-122.400356	1980/04/10	2014/07/10	[7.924799919128418,⋅4000.0]	2014-06-24	2014-06-24⋅00:00:00	True	bytearray(b'Security')	None
1⋅Megatron	13	tnanetueiL⋅tsriF	8	5000000	1.7999999523162842	['Meister']	33.670666,-117.841553	1980/04/10	2013/06/10	[3.962399959564209,⋅1800.0]	2013-06-24	2014-06-24⋅00:00:00	True	bytearray(b'First⋅Lieutenant')	None
1⋅Megatron	None	enoN	10	5000000	5.699999809265137	['Megatron']	None	1980/04/10	2012/05/10	[None,⋅5700.0]	2012-05-10	2014-06-24⋅00:00:00	True	bytearray(b'None')	None
megatron⋅1	300	noitatS⋅elttaB	8	5000000	None	['Metroflex']	None	1980/04/10	2011/04/10	[91.44000244140625,⋅None]	2011-04-10	2014-06-24⋅00:00:00	True	bytearray(b'Battle⋅Station')	None

names 1 (string) nullable	height(ft) 2 (smallint) nullable	function 3 (string) nullable	rank 4 (tinyint) nullable	age 5 (int) nullable	weight(t) 6 (float) nullable	japanese name 7 (array<string>) nullable	last position seen 8 (string) nullable	date arrival 9 (string) nullable	last date seen 10 (string) nullable	attributes 11 (array<float>) nullable	DateType 12 (date) nullable	timestamp 13 (timestamp) nullable	Cybertronian 14 (boolean) nullable	function(binary) 15 (binary) nullable	NullType 16 (null) nullable	names***FINGERPRINT 17 (string) nullable
Optimus⋅OptimusPrime	28	Leader	10	5000000	4.300000190734863	['Inochi',⋅'Convoy']	19.442735,-99.201111	1980/04/10	2016/09/10	[8.53439998626709,⋅4300.0]	2016-09-10	2014-06-24⋅00:00:00	True	bytearray(b'Leader')	None	optimusoptimusprime
bumbl#ebéé⋅⋅	17	Espionage	7	5000000	2.0	['Bumble',⋅'Goldback']	10.642707,-71.612534	1980/04/10	2015/08/10	[5.334000110626221,⋅2000.0]	2015-08-10	2014-06-24⋅00:00:00	True	bytearray(b'Espionage')	None	bumblebee
ironhide&	26	Security	7	5000000	4.0	['Roadbuster']	37.789563,-122.400356	1980/04/10	2014/07/10	[7.924799919128418,⋅4000.0]	2014-06-24	2014-06-24⋅00:00:00	True	bytearray(b'Security')	None	ironhide
1⋅Megatron	13	First⋅Lieutenant	8	5000000	1.7999999523162842	['Meister']	33.670666,-117.841553	1980/04/10	2013/06/10	[3.962399959564209,⋅1800.0]	2013-06-24	2014-06-24⋅00:00:00	True	bytearray(b'First⋅Lieutenant')	None	1megatron
1⋅Megatron	None	None	10	5000000	5.699999809265137	['Megatron']	None	1980/04/10	2012/05/10	[None,⋅5700.0]	2012-05-10	2014-06-24⋅00:00:00	True	bytearray(b'None')	None	1megatron
megatron⋅1	300	Battle⋅Station	8	5000000	None	['Metroflex']	None	1980/04/10	2011/04/10	[91.44000244140625,⋅None]	2011-04-10	2014-06-24⋅00:00:00	True	bytearray(b'Battle⋅Station')	None	1megatron

count 1 (string) not nullable	names 2 (string) nullable	names***NGRAM 3 (array<string>) not nullable	names***NGRAM_FINGERPRINT 4 (string) nullable
1	bumbl#ebéé⋅⋅	['bumblebee']	bumblebee
1	ironhide&	['ironhide']	ironhide
1	Megatron2	['megatron2']	megatron2
1	Optimus⋅OptimusPrime	['optimusoptimusprime']	optimusoptimusprime
1	Megatron1	['megatron1']	megatron1
1	Megatron	['megatron']	megatron

count 1 (string) not nullable	product 2 (string) nullable	product***NGRAM 3 (array<string>) not nullable	product***NGRAM_FINGERPRINT 4 (string) nullable
1	taaaccoo	['taaaccoo']	taaaccoo
1	piza	['piza']	piza
1	hamburguer	['hamburguer']	hamburguer
3	taco	['taco']	taco
1	BEER	['beer']	beer
1	pizzza	['pizzza']	pizzza
1	arepa	['arepa']	arepa
4	pizza	['pizza']	pizza
1	Rice	['rice']	rice
1	110790	['110790']	110790