In [1]:
%load_ext autoreload
%autoreload 2
In [2]:
import sys
sys.path.append("..")
In [3]:
from infer import Infer
Infer.value(12, "string")
Out[3]:
False
In [3]:
from optimus.optimus import Optimus
You are using PySparkling of version 2.4.10, but your PySpark is of
version 2.3.1. Please make sure Spark and PySparkling versions are compatible.
In [5]:
# from optimus import parse
In [6]:
# from optimus.bumblebee import Comm
# comm = Comm("this_is_the_queue_name")
In [5]:
# Create the Optimus entry point that later cells use (`op.create.df`, `op.load.csv`).
# NOTE(review): comm=True presumably enables the Bumblebee connection announced in
# this cell's recorded output — confirm against the Optimus constructor.
op= Optimus(comm=True)
Open Bumblebee: https://app.hi-bumblebee.comIf you really care about privacy get your keys in bumblebee.ini and put them here
C:/Users/argenisleon/Documents/Optimus/optimus/../infer.py
In [8]:
# Explicit imports instead of `from pyspark.sql.types import *`: the cell's
# dependencies stay visible and the notebook namespace is not flooded.
from datetime import date, datetime

from pyspark.sql.types import (
    ArrayType,
    BinaryType,
    BooleanType,
    ByteType,
    DateType,
    FloatType,
    NullType,
    ShortType,
    TimestampType,
)

# Column specification handed to op.create.df. Each entry is either a
# (name, dtype) tuple -- dtype being a short string alias or a Spark type
# instance -- or a bare column name with no dtype given.
cols = [
    ("names", "str"),
    ("height(ft)", ShortType()),
    ("function", "str"),
    ("rank", ByteType()),
    ("age", "int"),
    ("weight(t)", "float"),
    "japanese name",
    "last position seen",
    "date arrival",
    "last date seen",
    ("attributes", ArrayType(FloatType())),
    ("DateType", DateType()),
    ("timestamp", TimestampType()),
    ("Cybertronian", BooleanType()),
    ("function(binary)", BinaryType()),
    ("NullType", NullType()),
]

# Six records, one tuple each, with 16 values in the same order as `cols`.
# The last two records carry None in several positions (names, height, weight,
# position, array elements), so null handling is exercised as well.
rows = [
    ("argenisleon@gmail.com", 28, "Leader", 10, 5000000, 4.30, ["Inochi", "Convoy"], "19.442735,-99.201111", "1980/04/10",
     "2016/09/10", [8.5344, 4300.0], date(2016, 9, 10), datetime(2014, 6, 24), True, bytearray("Leader", "utf-8"),
     None),
    ("bumbl#ebéé ", 17, "Espionage", 7, 5000000, 2.0, ["Bumble", "Goldback"], "10.642707,-71.612534", "1980/04/10",
     "2015/08/10", [5.334, 2000.0], date(2015, 8, 10), datetime(2014, 6, 24), True, bytearray("Espionage", "utf-8"),
     None),
    ("ironhide&", 26, "Security", 7, 5000000, 4.0, ["Roadbuster"], "37.789563,-122.400356", "1980/04/10",
     "2014/07/10", [7.9248, 4000.0], date(2014, 6, 24), datetime(2014, 6, 24), True, bytearray("Security", "utf-8"),
     None),
    ("1 Megatron", 13, "First Lieutenant", 8, 5000000, 1.80, ["Meister"], "33.670666,-117.841553", "1980/04/10",
     "2013/06/10", [3.9624, 1800.0], date(2013, 6, 24), datetime(2014, 6, 24), True,
     bytearray("First Lieutenant", "utf-8"), None),
    ("1 Megatron", None, "None", 10, 5000000, 5.70, ["Megatron"], None, "1980/04/10", "2012/05/10", [None, 5700.0],
     date(2012, 5, 10), datetime(2014, 6, 24), True, bytearray("None", "utf-8"), None),
    (None, 300, "Battle Station", 8, 5000000, None, ["Metroflex"], None, "1980/04/10", "2011/04/10",
     [91.44, None], date(2011, 4, 10), datetime(2014, 6, 24), True, bytearray("Battle Station", "utf-8"), None),
]

# Build the DataFrame, cache it, and collapse it into a single partition.
# NOTE(review): the meaning of the third positional argument (False) is not
# visible in this notebook -- confirm against op.create.df.
df = op.create.df(cols, rows, False).cache().repartition(1)
In [19]:
df.table(20)
Viewing 19 of 19 rows / 8 columns
1 partition(s)
id
1 (int)
nullable
firstName
2 (string)
nullable
lastName
3 (string)
nullable
billingId
4 (int)
nullable
product
5 (string)
nullable
price
6 (int)
nullable
birth
7 (string)
nullable
dummyCol
8 (string)
nullable
1
Luis
Alvarez$$%!
123
Cake
10
1980/07/07
never
2
André
Ampère
423
piza
8
1950/07/08
gonna
3
NiELS
Böhr//((%%
551
pizza
8
1990/07/09
give
4
PAUL
dirac$
521
pizza
8
1954/07/10
you
5
Albert
Einstein
634
pizza
8
1990/07/11
up
6
Galileo
⋅⋅⋅⋅⋅⋅⋅⋅⋅⋅⋅⋅⋅GALiLEI
672
arepa
5
1930/08/12
never
7
CaRL
Ga%%%uss
323
taco
3
1970/07/13
gonna
8
David
H$$$ilbert
624
taaaccoo
3
1950/07/14
let
9
Johannes
KEPLER
735
taco
3
1920/04/22
you
10
JaMES
M$$ax%%well
875
taco
3
1923/03/12
down
11
Isaac
Newton
992
pasta
9
1999/02/15
never⋅
12
Emmy%%
Nöether$
234
pasta
9
1993/12/08
gonna
13
Max!!!
Planck!!!
111
hamburguer
4
1994/01/04
run⋅
14
Fred
Hoy&&≤
553
pizzza
8
1997/06/27
around
15
(((⋅⋅⋅Heinrich⋅)))))
Hertz
116
pizza
8
1956/11/30
and
16
William
Gilbert###
886
BEER
2
1958/03/26
desert
17
Marie
CURIE
912
Rice
1
2000/03/22
you
18
Arthur
COM%%%pton
812
110790
5
1899/01/01
#
19
JAMES
Chadwick
467
null
10
1921/05/03
#
Viewing 19 of 19 rows / 8 columns
1 partition(s)
In [6]:
# Load the sample foo.csv from the Optimus GitHub repository.
# Note: this rebinds `df`, replacing whatever DataFrame the name held before.
df = op.load.csv(
    "https://raw.githubusercontent.com/ironmussa/Optimus/master/examples/data/foo.csv",
    sep=",",
    header='true',
    infer_schema='true',
    charset="UTF-8",
    null_value="None",
)
In [7]:
outlier = df.outliers.mad("price", threshold = 1)
In [8]:
outlier.hist("price")
Out[8]:
'{"price": {"hist": [{"count": 6.0, "lower": 8.0, "upper": 8.1}, {"count": 0.0, "lower": 8.1, "upper": 8.2}, {"count": 0.0, "lower": 8.2, "upper": 8.3}, {"count": 0.0, "lower": 8.3, "upper": 8.4}, {"count": 0.0, "lower": 8.4, "upper": 8.5}, {"count": 0.0, "lower": 8.5, "upper": 8.6}, {"count": 0.0, "lower": 8.6, "upper": 8.7}, {"count": 0.0, "lower": 8.7, "upper": 8.8}, {"count": 0.0, "lower": 8.8, "upper": 8.9}, {"count": 0.0, "lower": 8.9, "upper": 9.0}, {"count": 2.0, "lower": 9.0, "upper": 9.1}, {"count": 0.0, "lower": 9.1, "upper": 9.2}, {"count": 0.0, "lower": 9.2, "upper": 9.3}, {"count": 0.0, "lower": 9.3, "upper": 9.4}, {"count": 0.0, "lower": 9.4, "upper": 9.5}, {"count": 0.0, "lower": 9.5, "upper": 9.6}, {"count": 0.0, "lower": 9.6, "upper": 9.7}, {"count": 0.0, "lower": 9.7, "upper": 9.8}, {"count": 0.0, "lower": 9.8, "upper": 9.9}, {"count": 0.0, "lower": 9.9, "upper": 10.0}]}}'
In [12]:
df.cols.count_by_dtypes("id")
Out[12]:
{'id': {'null': 0, 'missing': 0, 'int': 19}}
In [22]:
df.count()
Out[22]:
19
In [24]:
outlier.info()
6
Out[24]:
{'count_outliers': 9,
'count_non_outliers': 10,
'lower_bound': 6,
'lower_bound_count': 9,
'upper_bound': 10,
'upper_bound_count': 0}
In [11]:
# df.table()
In [12]:
# Tally the "names" column against the expected "email" dtype; the recorded result
# reports per-column counts for 'email', 'mismatch', 'null' and 'missing'.
# The dict literal used to list the "names" key twice. Python keeps only the last
# value of a repeated key, so the first entry ("argenisleon@gmail.com") never
# reached count_mismatch and has been dropped; the argument passed is unchanged.
df.cols.count_mismatch({"names": "email"})
Out[12]:
{'names': {'email': 1, 'mismatch': 4, 'null': 1, 'missing': 0}}
In [14]:
# Nested dict with the same keys as the count_mismatch result recorded above,
# minus the 'missing' entry.
a = dict(names=dict(email=1, mismatch=4, null=1))
In [15]:
tuple({"firstName":"string","lastName":"array"}.values())
Out[15]:
('string', 'array')
In [16]:
from infer import Infer
In [17]:
from infer import Infer
Infer.mismatch(("names",None),{"names":"email"})
Out[17]:
(('names', 'null'), 1)
In [20]:
Infer.value(12, "string")
In [36]:
list({"firstName":"string","lastName":"string"}.keys())
Out[36]:
['firstName', 'lastName']
In [8]:
# NOTE(review): the recorded run of this cell raised NameError ("name 'df' is not
# defined") -- presumably it was executed in a kernel session before any cell that
# binds `df`. Run a cell that creates `df` first.
df.rows.select_by_dtypes("names","str")
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
<ipython-input-8-5a6988a57346> in <module>
----> 1 df.rows.select_by_dtypes("names","str")
NameError: name 'df' is not defined
In [117]:
# Row filter on "height(ft)" with bounds 17 and 26, rendered as a table.
# (The cell was originally labelled "Histograma" / histogram, although no
# histogram is drawn here.)
# NOTE(review): equal=True presumably makes the bounds inclusive -- confirm.
df.rows.between(
    "height(ft)",
    17,
    26,
    invert=False,
    equal=True,
).table()
Viewing 3 of 3 rows / 16 columns
1 partition(s)
names
1 (string)
nullable
height(ft)
2 (smallint)
nullable
function
3 (string)
nullable
rank
4 (tinyint)
nullable
age
5 (int)
nullable
weight(t)
6 (float)
nullable
japanese name
7 (array<string>)
nullable
last position seen
8 (string)
nullable
date arrival
9 (string)
nullable
last date seen
10 (string)
nullable
attributes
11 (array<float>)
nullable
DateType
12 (date)
nullable
timestamp
13 (timestamp)
nullable
Cybertronian
14 (boolean)
nullable
function(binary)
15 (binary)
nullable
NullType
16 (null)
nullable
bumbl#ebéé⋅⋅
17
Espionage
7
5000000
2.0
['Bumble',⋅'Goldback']
10.642707,-71.612534
1980/04/10
2015/08/10
[5.334000110626221,⋅2000.0]
2015-08-10
2014-06-24⋅00:00:00
True
bytearray(b'Espionage')
None
ironhide&
26
Security
7
5000000
4.0
['Roadbuster']
37.789563,-122.400356
1980/04/10
2014/07/10
[7.924799919128418,⋅4000.0]
2014-06-24
2014-06-24⋅00:00:00
True
bytearray(b'Security')
None
1⋅Megatron
13
First⋅Lieutenant
8
5000000
1.7999999523162842
['Meister']
33.670666,-117.841553
1980/04/10
2013/06/10
[3.962399959564209,⋅1800.0]
2013-06-24
2014-06-24⋅00:00:00
True
bytearray(b'First⋅Lieutenant')
None
Viewing 3 of 3 rows / 16 columns
1 partition(s)
In [ ]:
In [55]:
df.cols.reverse("function").table()
Viewing 6 of 6 rows / 16 columns
1 partition(s)
names
1 (string)
nullable
height(ft)
2 (smallint)
nullable
function
3 (string)
nullable
rank
4 (tinyint)
nullable
age
5 (int)
nullable
weight(t)
6 (float)
nullable
japanese name
7 (array<string>)
nullable
last position seen
8 (string)
nullable
date arrival
9 (string)
nullable
last date seen
10 (string)
nullable
attributes
11 (array<float>)
nullable
DateType
12 (date)
nullable
timestamp
13 (timestamp)
nullable
Cybertronian
14 (boolean)
nullable
function(binary)
15 (binary)
nullable
NullType
16 (null)
nullable
Optimus⋅OptimusPrime
28
redaeL
10
5000000
4.300000190734863
['Inochi',⋅'Convoy']
19.442735,-99.201111
1980/04/10
2016/09/10
[8.53439998626709,⋅4300.0]
2016-09-10
2014-06-24⋅00:00:00
True
bytearray(b'Leader')
None
bumbl#ebéé⋅⋅
17
eganoipsE
7
5000000
2.0
['Bumble',⋅'Goldback']
10.642707,-71.612534
1980/04/10
2015/08/10
[5.334000110626221,⋅2000.0]
2015-08-10
2014-06-24⋅00:00:00
True
bytearray(b'Espionage')
None
ironhide&
26
ytiruceS
7
5000000
4.0
['Roadbuster']
37.789563,-122.400356
1980/04/10
2014/07/10
[7.924799919128418,⋅4000.0]
2014-06-24
2014-06-24⋅00:00:00
True
bytearray(b'Security')
None
1⋅Megatron
13
tnanetueiL⋅tsriF
8
5000000
1.7999999523162842
['Meister']
33.670666,-117.841553
1980/04/10
2013/06/10
[3.962399959564209,⋅1800.0]
2013-06-24
2014-06-24⋅00:00:00
True
bytearray(b'First⋅Lieutenant')
None
1⋅Megatron
None
enoN
10
5000000
5.699999809265137
['Megatron']
None
1980/04/10
2012/05/10
[None,⋅5700.0]
2012-05-10
2014-06-24⋅00:00:00
True
bytearray(b'None')
None
megatron⋅1
300
noitatS⋅elttaB
8
5000000
None
['Metroflex']
None
1980/04/10
2011/04/10
[91.44000244140625,⋅None]
2011-04-10
2014-06-24⋅00:00:00
True
bytearray(b'Battle⋅Station')
None
Viewing 6 of 6 rows / 16 columns
1 partition(s)
In [20]:
# NOTE(review): "mass (g)" is not a column of any table shown in this notebook's
# recorded outputs, so `df` presumably pointed at a different dataset when this
# cell ran -- confirm which dataset must be loaded before it.
outlier = df.outliers.tukey("mass (g)")
In [28]:
# print(outlier.info())
outlier.select_lower_bound()
Out[28]:
'{"columns": [{"title": "mass (g)"}], "value": [[21.0], [160.0], [252.0], [256.8], [320.0], [41.0], [94.2], [265.0], [146.0], [134.0], [345.0], [14.0], [23.2], [17.0], [375.0], [270.0], [13.9], [18.0], [100.0], [488.1], [470.0], [67.8], [56.0], [190.0], [219.0], [324.0], [357.0], [212.0], [478.0], [342.0], [8.0], [94.0], [45.6], [0.5], [72.0], [367.0], [303.0], [48.6], [469.0], [78.4], [167.0], [100.0], [340.0], [28.0], [0.8], [230.0], [400.0], [438.0], [230.0], [30.0], [300.0], [188.0], [127.0], [277.0], [113.0], [107.2], [380.0], [82.0], [220.0], [240.0], [132.7], [36.1], [28.0], [380.0], [102.0], [480.0], [45.5], [215.0], [288.0], [28.0], [0.2], [315.0], [414.0], [167.7], [305.5], [180.0], [266.1], [112.0], [22.0], [450.0], [222.0], [100.0], [30.0], [483.0], [89.0], [230.0], [350.0], [448.0], [299.0], [400.0], [180.0], [450.0], [100.0], [331.0], [195.0], [140.0], [67.4], [97.7], [202.6], [136.0]]}'
In [256]:
keyCol.fingerprint(df,"product").table()
Viewing 10 of 19 rows / 9 columns
1 partition(s)
id
1 (int)
nullable
firstName
2 (string)
nullable
lastName
3 (string)
nullable
billingId
4 (int)
nullable
product
5 (string)
nullable
price
6 (int)
nullable
birth
7 (string)
nullable
dummyCol
8 (string)
nullable
product***FINGERPRINT
9 (string)
nullable
1
Luis
Alvarez$$%!
123
Cake
10
1980/07/07
never
cake
2
André
Ampère
423
piza
8
1950/07/08
gonna
piza
3
NiELS
Böhr//((%%
551
pizza
8
1990/07/09
give
pizza
4
PAUL
dirac$
521
pizza
8
1954/07/10
you
pizza
5
Albert
Einstein
634
pizza
8
1990/07/11
up
pizza
6
Galileo
⋅⋅⋅⋅⋅⋅⋅⋅⋅⋅⋅⋅⋅GALiLEI
672
arepa
5
1930/08/12
never
arepa
7
CaRL
Ga%%%uss
323
taco
3
1970/07/13
gonna
taco
8
David
H$$$ilbert
624
taaaccoo
3
1950/07/14
let
taaaccoo
9
Johannes
KEPLER
735
taco
3
1920/04/22
you
taco
10
JaMES
M$$ax%%well
875
taco
3
1923/03/12
down
taco
Viewing 10 of 19 rows / 9 columns
1 partition(s)
In [245]:
keyCol.fingerprint(df,"names").table()
Viewing 6 of 6 rows / 17 columns
1 partition(s)
names
1 (string)
nullable
height(ft)
2 (smallint)
nullable
function
3 (string)
nullable
rank
4 (tinyint)
nullable
age
5 (int)
nullable
weight(t)
6 (float)
nullable
japanese name
7 (array<string>)
nullable
last position seen
8 (string)
nullable
date arrival
9 (string)
nullable
last date seen
10 (string)
nullable
attributes
11 (array<float>)
nullable
DateType
12 (date)
nullable
timestamp
13 (timestamp)
nullable
Cybertronian
14 (boolean)
nullable
function(binary)
15 (binary)
nullable
NullType
16 (null)
nullable
names***FINGERPRINT
17 (string)
nullable
Optimus⋅OptimusPrime
28
Leader
10
5000000
4.300000190734863
['Inochi',⋅'Convoy']
19.442735,-99.201111
1980/04/10
2016/09/10
[8.53439998626709,⋅4300.0]
2016-09-10
2014-06-24⋅00:00:00
True
bytearray(b'Leader')
None
optimusoptimusprime
bumbl#ebéé⋅⋅
17
Espionage
7
5000000
2.0
['Bumble',⋅'Goldback']
10.642707,-71.612534
1980/04/10
2015/08/10
[5.334000110626221,⋅2000.0]
2015-08-10
2014-06-24⋅00:00:00
True
bytearray(b'Espionage')
None
bumblebee
ironhide&
26
Security
7
5000000
4.0
['Roadbuster']
37.789563,-122.400356
1980/04/10
2014/07/10
[7.924799919128418,⋅4000.0]
2014-06-24
2014-06-24⋅00:00:00
True
bytearray(b'Security')
None
ironhide
1⋅Megatron
13
First⋅Lieutenant
8
5000000
1.7999999523162842
['Meister']
33.670666,-117.841553
1980/04/10
2013/06/10
[3.962399959564209,⋅1800.0]
2013-06-24
2014-06-24⋅00:00:00
True
bytearray(b'First⋅Lieutenant')
None
1megatron
1⋅Megatron
None
None
10
5000000
5.699999809265137
['Megatron']
None
1980/04/10
2012/05/10
[None,⋅5700.0]
2012-05-10
2014-06-24⋅00:00:00
True
bytearray(b'None')
None
1megatron
megatron⋅1
300
Battle⋅Station
8
5000000
None
['Metroflex']
None
1980/04/10
2011/04/10
[91.44000244140625,⋅None]
2011-04-10
2014-06-24⋅00:00:00
True
bytearray(b'Battle⋅Station')
None
1megatron
Viewing 6 of 6 rows / 17 columns
1 partition(s)
In [259]:
keyCol.fingerprint_cluster(df,"product", output="json")
Out[259]:
'{"taaaccoo": {"similar": {"taaaccoo": 1}, "count": 1, "sum": 1}, "piza": {"similar": {"piza": 1}, "count": 1, "sum": 1}, "hamburguer": {"similar": {"hamburguer": 1}, "count": 1, "sum": 1}, "taco": {"similar": {"taco": 3}, "count": 1, "sum": 3}, "pizzza": {"similar": {"pizzza": 1}, "count": 1, "sum": 1}, "arepa": {"similar": {"arepa": 1}, "count": 1, "sum": 1}, "pizza": {"similar": {"pizza": 4}, "count": 1, "sum": 4}, "Rice": {"similar": {"Rice": 1}, "count": 1, "sum": 1}, "110790": {"similar": {"110790": 1}, "count": 1, "sum": 1}, "BEER": {"similar": {"BEER": 1}, "count": 1, "sum": 1}, "Cake": {"similar": {"Cake": 1}, "count": 1, "sum": 1}, "null": {"similar": {"null": 1}, "count": 1, "sum": 1}, "pasta": {"similar": {"pasta": 2}, "count": 1, "sum": 2}}'
In [261]:
keyCol.n_gram_fingerprint_cluster(df,"product", output="json",n_size=2)
Out[261]:
'{"arepa": {"similar": {"arepa": 1}, "count": 1, "sum": 1}, "taaaccoo": {"similar": {"taaaccoo": 1}, "count": 1, "sum": 1}, "pasta": {"similar": {"pasta": 2}, "count": 1, "sum": 2}, "pizza": {"similar": {"pizzza": 1, "pizza": 4}, "count": 2, "sum": 5}, "110790": {"similar": {"110790": 1}, "count": 1, "sum": 1}, "hamburguer": {"similar": {"hamburguer": 1}, "count": 1, "sum": 1}, "taco": {"similar": {"taco": 3}, "count": 1, "sum": 3}, "Cake": {"similar": {"Cake": 1}, "count": 1, "sum": 1}, "Rice": {"similar": {"Rice": 1}, "count": 1, "sum": 1}, "piza": {"similar": {"piza": 1}, "count": 1, "sum": 1}, "null": {"similar": {"null": 1}, "count": 1, "sum": 1}, "BEER": {"similar": {"BEER": 1}, "count": 1, "sum": 1}}'
In [7]:
from optimus.ml import keycollision as keyCol
from optimus.ml import distancecluster as dc
In [258]:
dc.levenshtein_cluster(df,"product", output="json")
Out[258]:
'{"taaaccoo": {"similar": {"taco": 3, "taaaccoo": 1}, "count": 2, "sum": 4}, "piza": {"similar": {"pizza": 4, "piza": 1}, "count": 2, "sum": 5}, "hamburguer": {"similar": {"BEER": 1, "hamburguer": 1}, "count": 2, "sum": 2}, "taco": {"similar": {"Cake": 1, "Rice": 1, "taco": 3}, "count": 3, "sum": 5}, "pizzza": {"similar": {"pizza": 4, "pizzza": 1}, "count": 2, "sum": 5}, "arepa": {"similar": {"BEER": 1, "piza": 1, "pasta": 2, "Cake": 1, "Rice": 1, "pizza": 4, "arepa": 1}, "count": 7, "sum": 11}, "pizza": {"similar": {"piza": 1, "pizzza": 1, "pizza": 4}, "count": 3, "sum": 6}, "Rice": {"similar": {"piza": 1, "Cake": 1, "taco": 3, "Rice": 1}, "count": 4, "sum": 6}, "110790": {"similar": {"arepa": 1, "BEER": 1, "piza": 1, "pizzza": 1, "pasta": 2, "Cake": 1, "null": 1, "Rice": 1, "pizza": 4, "taco": 3, "110790": 1}, "count": 11, "sum": 17}, "BEER": {"similar": {"arepa": 1, "piza": 1, "Cake": 1, "null": 1, "Rice": 1, "taco": 3, "BEER": 1}, "count": 7, "sum": 9}, "Cake": {"similar": {"Rice": 1, "taco": 3, "Cake": 1}, "count": 3, "sum": 5}, "null": {"similar": {"BEER": 1, "piza": 1, "Cake": 1, "Rice": 1, "taco": 3, "null": 1}, "count": 6, "sum": 8}, "pasta": {"similar": {"piza": 1, "pizza": 4, "pasta": 2}, "count": 3, "sum": 7}}'
In [31]:
keyCol.n_gram_fingerprint_cluster(df,"names", n_size=1,output="json")
Viewing 6 of 6 rows / 4 columns
1 partition(s)
count
1 (string)
not nullable
names
2 (string)
nullable
names***NGRAM
3 (array<string>)
not nullable
names***NGRAM_FINGERPRINT
4 (string)
nullable
1
bumbl#ebéé⋅⋅
['bumblebee']
bumblebee
1
ironhide&
['ironhide']
ironhide
1
Megatron2
['megatron2']
megatron2
1
Optimus⋅OptimusPrime
['optimusoptimusprime']
optimusoptimusprime
1
Megatron1
['megatron1']
megatron1
1
Megatron
['megatron']
megatron
Viewing 6 of 6 rows / 4 columns
1 partition(s)
Out[31]:
'{"ironhide&": {"similar": {"ironhide&": 1}, "count": 1, "sum": 1.0}, "Megatron1": {"similar": {"Megatron1": 1}, "count": 1, "sum": 1.0}, "Optimus OptimusPrime": {"similar": {"Optimus OptimusPrime": 1}, "count": 1, "sum": 1.0}, "Megatron": {"similar": {"Megatron": 1}, "count": 1, "sum": 1.0}, "bumbl#eb\\u00e9\\u00e9 ": {"similar": {"bumbl#eb\\u00e9\\u00e9 ": 1}, "count": 1, "sum": 1.0}, "Megatron2": {"similar": {"Megatron2": 1}, "count": 1, "sum": 1.0}}'
In [25]:
df.table()
Viewing 6 of 6 rows / 16 columns
1 partition(s)
names
1 (string)
nullable
height(ft)
2 (smallint)
nullable
function
3 (string)
nullable
rank
4 (tinyint)
nullable
age
5 (int)
nullable
weight(t)
6 (float)
nullable
japanese name
7 (array<string>)
nullable
last position seen
8 (string)
nullable
date arrival
9 (string)
nullable
last date seen
10 (string)
nullable
attributes
11 (array<float>)
nullable
DateType
12 (date)
nullable
timestamp
13 (timestamp)
nullable
Cybertronian
14 (boolean)
nullable
function(binary)
15 (binary)
nullable
NullType
16 (null)
nullable
Optimus⋅OptimusPrime
28
Leader
10
5000000
4.300000190734863
['Inochi',⋅'Convoy']
19.442735,-99.201111
1980/04/10
2016/09/10
[8.53439998626709,⋅4300.0]
2016-09-10
2014-06-24⋅00:00:00
True
bytearray(b'Leader')
None
bumbl#ebéé⋅⋅
17
Espionage
7
5000000
2.0
['Bumble',⋅'Goldback']
10.642707,-71.612534
1980/04/10
2015/08/10
[5.334000110626221,⋅2000.0]
2015-08-10
2014-06-24⋅00:00:00
True
bytearray(b'Espionage')
None
ironhide&
26
Security
7
5000000
4.0
['Roadbuster']
37.789563,-122.400356
1980/04/10
2014/07/10
[7.924799919128418,⋅4000.0]
2014-06-24
2014-06-24⋅00:00:00
True
bytearray(b'Security')
None
Megatron1
13
First⋅Lieutenant
8
5000000
1.7999999523162842
['Meister']
33.670666,-117.841553
1980/04/10
2013/06/10
[3.962399959564209,⋅1800.0]
2013-06-24
2014-06-24⋅00:00:00
True
bytearray(b'First⋅Lieutenant')
None
Megatron
None
None
10
5000000
5.699999809265137
['Megatron']
None
1980/04/10
2012/05/10
[None,⋅5700.0]
2012-05-10
2014-06-24⋅00:00:00
True
bytearray(b'None')
None
megatron
300
Battle⋅Station
8
5000000
None
['Metroflex']
None
1980/04/10
2011/04/10
[91.44000244140625,⋅None]
2011-04-10
2014-06-24⋅00:00:00
True
bytearray(b'Battle⋅Station')
None
Viewing 6 of 6 rows / 16 columns
1 partition(s)
In [81]:
# df = op.load.csv("data/foo.csv", sep=",", header='true', infer_schema='true', charset="UTF-8", null_value="None")
In [82]:
df.table()
Viewing 6 of 6 rows / 16 columns
1 partition(s)
names
1 (string)
nullable
height(ft)
2 (smallint)
nullable
function
3 (string)
nullable
rank
4 (tinyint)
nullable
age
5 (int)
nullable
weight(t)
6 (float)
nullable
japanese name
7 (array<string>)
nullable
last position seen
8 (string)
nullable
date arrival
9 (string)
nullable
last date seen
10 (string)
nullable
attributes
11 (array<float>)
nullable
DateType
12 (date)
nullable
timestamp
13 (timestamp)
nullable
Cybertronian
14 (boolean)
nullable
function(binary)
15 (binary)
nullable
NullType
16 (null)
nullable
Optimus⋅OptimusPrime
28
Leader
10
5000000
4.300000190734863
['Inochi',⋅'Convoy']
19.442735,-99.201111
1980/04/10
2016/09/10
[8.53439998626709,⋅4300.0]
2016-09-10
2014-06-24⋅00:00:00
True
bytearray(b'Leader')
None
bumbl#ebéé⋅⋅
17
Espionage
7
5000000
2.0
['Bumble',⋅'Goldback']
10.642707,-71.612534
1980/04/10
2015/08/10
[5.334000110626221,⋅2000.0]
2015-08-10
2014-06-24⋅00:00:00
True
bytearray(b'Espionage')
None
ironhide&
26
Security
7
5000000
4.0
['Roadbuster']
37.789563,-122.400356
1980/04/10
2014/07/10
[7.924799919128418,⋅4000.0]
2014-06-24
2014-06-24⋅00:00:00
True
bytearray(b'Security')
None
JaJa⋅JaJaJ
13
First⋅Lieutenant
8
5000000
1.7999999523162842
['Meister']
33.670666,-117.841553
1980/04/10
2013/06/10
[3.962399959564209,⋅1800.0]
2013-06-24
2014-06-24⋅00:00:00
True
bytearray(b'First⋅Lieutenant')
None
Megatron
None
None
10
5000000
5.699999809265137
['Megatron']
None
1980/04/10
2012/05/10
[None,⋅5700.0]
2012-05-10
2014-06-24⋅00:00:00
True
bytearray(b'None')
None
Metroplex_)^$
300
Battle⋅Station
8
5000000
None
['Metroflex']
None
1980/04/10
2011/04/10
[91.44000244140625,⋅None]
2011-04-10
2014-06-24⋅00:00:00
True
bytearray(b'Battle⋅Station')
None
Viewing 6 of 6 rows / 16 columns
1 partition(s)
In [95]:
df.cols.replace("names",["JaJa","bbb"],"aaa",search_by="words").table()
Viewing 6 of 6 rows / 16 columns
1 partition(s)
names
1 (string)
nullable
height(ft)
2 (smallint)
nullable
function
3 (string)
nullable
rank
4 (tinyint)
nullable
age
5 (int)
nullable
weight(t)
6 (float)
nullable
japanese name
7 (array<string>)
nullable
last position seen
8 (string)
nullable
date arrival
9 (string)
nullable
last date seen
10 (string)
nullable
attributes
11 (array<float>)
nullable
DateType
12 (date)
nullable
timestamp
13 (timestamp)
nullable
Cybertronian
14 (boolean)
nullable
function(binary)
15 (binary)
nullable
NullType
16 (null)
nullable
Optimus⋅OptimusPrime
28
Leader
10
5000000
4.300000190734863
['Inochi',⋅'Convoy']
19.442735,-99.201111
1980/04/10
2016/09/10
[8.53439998626709,⋅4300.0]
2016-09-10
2014-06-24⋅00:00:00
True
bytearray(b'Leader')
None
bumbl#ebéé⋅⋅
17
Espionage
7
5000000
2.0
['Bumble',⋅'Goldback']
10.642707,-71.612534
1980/04/10
2015/08/10
[5.334000110626221,⋅2000.0]
2015-08-10
2014-06-24⋅00:00:00
True
bytearray(b'Espionage')
None
ironhide&
26
Security
7
5000000
4.0
['Roadbuster']
37.789563,-122.400356
1980/04/10
2014/07/10
[7.924799919128418,⋅4000.0]
2014-06-24
2014-06-24⋅00:00:00
True
bytearray(b'Security')
None
aaa⋅JaJaJ
13
First⋅Lieutenant
8
5000000
1.7999999523162842
['Meister']
33.670666,-117.841553
1980/04/10
2013/06/10
[3.962399959564209,⋅1800.0]
2013-06-24
2014-06-24⋅00:00:00
True
bytearray(b'First⋅Lieutenant')
None
Megatron
None
None
10
5000000
5.699999809265137
['Megatron']
None
1980/04/10
2012/05/10
[None,⋅5700.0]
2012-05-10
2014-06-24⋅00:00:00
True
bytearray(b'None')
None
Metroplex_)^$
300
Battle⋅Station
8
5000000
None
['Metroflex']
None
1980/04/10
2011/04/10
[91.44000244140625,⋅None]
2011-04-10
2014-06-24⋅00:00:00
True
bytearray(b'Battle⋅Station')
None
Viewing 6 of 6 rows / 16 columns
1 partition(s)
In [20]:
df.send()
Send!
In [7]:
df.table(20)
Out[7]:
Viewing 19 of 19 rows / 8 columns
1 partition(s)
id
1 (int)
nullable
firstName
2 (string)
nullable
lastName
3 (string)
nullable
billingId
4 (int)
nullable
product
5 (string)
nullable
price
6 (int)
nullable
birth
7 (string)
nullable
dummyCol
8 (string)
nullable
1
Luis
Alvarez$$%!
123
Cake
10
1980/07/07
never
2
André
Ampère
423
piza
8
1950/07/08
gonna
3
NiELS
Böhr//((%%
551
pizza
8
1990/07/09
give
4
PAUL
dirac$
521
pizza
8
1954/07/10
you
5
Albert
Einstein
634
pizza
8
1990/07/11
up
6
Galileo
⋅⋅⋅⋅⋅⋅⋅⋅⋅⋅⋅⋅⋅GALiLEI
672
arepa
5
1930/08/12
never
7
CaRL
Ga%%%uss
323
taco
3
1970/07/13
gonna
8
David
H$$$ilbert
624
taaaccoo
3
1950/07/14
let
9
Johannes
KEPLER
735
taco
3
1920/04/22
you
10
JaMES
M$$ax%%well
875
taco
3
1923/03/12
down
11
Isaac
Newton
992
pasta
9
1999/02/15
never⋅
12
Emmy%%
Nöether$
234
pasta
9
1993/12/08
gonna
13
Max!!!
Planck!!!
111
hamburguer
4
1994/01/04
run⋅
14
Fred
Hoy&&≤
553
pizzza
8
1997/06/27
around
15
(((⋅⋅⋅Heinrich⋅)))))
Hertz
116
pizza
8
1956/11/30
and
16
William
Gilbert###
886
BEER
2
1958/03/26
desert
17
Marie
CURIE
912
Rice
1
2000/03/22
you
18
Arthur
COM%%%pton
812
110790
5
1899/01/01
#
19
JAMES
Chadwick
467
null
10
1921/05/03
#
Viewing 19 of 19 rows / 8 columns
1 partition(s)
In [10]:
df.outliers.z_score("price",threshold =1).info()
Out[10]:
{'count_outliers': 8, 'count_non_outliers': 11, 'max_z_score': 1.7111}
In [8]:
df.outliers.tukey("price").info()
Out[8]:
{'count_outliers': 0,
'count_non_outliers': 19,
'lower_bound': -4.5,
'lower_bound_count': 0,
'upper_bound': 15.5,
'upper_bound_count': 0,
'iqr1': 3,
'iqr3': 8}
In [9]:
df.outliers.mad("price", threshold =1).info()
Out[9]:
{'count_outliers': 9,
'count_non_outliers': 19,
'lower_bound': 6,
'lower_bound_count': 9,
'upper_bound': 10,
'upper_bound_count': 0}
In [11]:
df.outliers.modified_z_score("price",threshold =1).info()
Out[11]:
{'count_outliers': 19, 'count_non_outliers': 19, 'max_m_z_score': 2.36075}
In [47]:
%%time
from optimus.ml import distancecluster as dc
print(dc.levenshtein_cluster(df,'product',output="json"))
{"taaaccoo": {"similar": {"taco": 3, "taaaccoo": 1}, "count": 2, "sum": 4}, "piza": {"similar": {"pizza": 4, "piza": 1}, "count": 2, "sum": 5}, "hamburguer": {"similar": {"BEER": 1, "hamburguer": 1}, "count": 2, "sum": 2}, "taco": {"similar": {"Cake": 1, "Rice": 1, "taco": 3}, "count": 3, "sum": 5}, "pizzza": {"similar": {"pizza": 4, "pizzza": 1}, "count": 2, "sum": 5}, "arepa": {"similar": {"BEER": 1, "piza": 1, "pasta": 2, "Cake": 1, "Rice": 1, "pizza": 4, "arepa": 1}, "count": 7, "sum": 11}, "pizza": {"similar": {"piza": 1, "pizzza": 1, "pizza": 4}, "count": 3, "sum": 6}, "Rice": {"similar": {"piza": 1, "Cake": 1, "taco": 3, "Rice": 1}, "count": 4, "sum": 6}, "110790": {"similar": {"arepa": 1, "BEER": 1, "piza": 1, "pizzza": 1, "pasta": 2, "Cake": 1, "null": 1, "Rice": 1, "pizza": 4, "taco": 3, "110790": 1}, "count": 11, "sum": 17}, "BEER": {"similar": {"arepa": 1, "piza": 1, "Cake": 1, "null": 1, "Rice": 1, "taco": 3, "BEER": 1}, "count": 7, "sum": 9}, "Cake": {"similar": {"Rice": 1, "taco": 3, "Cake": 1}, "count": 3, "sum": 5}, "null": {"similar": {"BEER": 1, "piza": 1, "Cake": 1, "Rice": 1, "taco": 3, "null": 1}, "count": 6, "sum": 8}, "pasta": {"similar": {"piza": 1, "pizza": 4, "pasta": 2}, "count": 3, "sum": 7}}
Wall time: 9.6 s
In [51]:
from optimus.ml import distancecluster as dc
# NOTE(review): an earlier cell imports this same module under the alias `keyCol`;
# both aliases are used in this notebook.
from optimus.ml import keycollision as kc
# result = dc.levenshtein_json(df,'product')
# NOTE(review): the meaning of the third positional argument (3) is not visible
# here -- confirm against keycollision.fingerprint_cluster.
result = kc.fingerprint_cluster(df, "product",3)
In [62]:
result = kc.n_gram_fingerprint_cluster(df, "product",3)
Viewing 10 of 13 rows / 4 columns
1 partition(s)
count
1 (string)
not nullable
product
2 (string)
nullable
product***NGRAM
3 (array<string>)
not nullable
product***NGRAM_FINGERPRINT
4 (string)
nullable
1
taaaccoo
['taaaccoo']
taaaccoo
1
piza
['piza']
piza
1
hamburguer
['hamburguer']
hamburguer
3
taco
['taco']
taco
1
BEER
['beer']
beer
1
pizzza
['pizzza']
pizzza
1
arepa
['arepa']
arepa
4
pizza
['pizza']
pizza
1
Rice
['rice']
rice
1
110790
['110790']
110790
Viewing 10 of 13 rows / 4 columns
1 partition(s)
In [63]:
print(result)
{'taaaccoo': {'similar': ['taaaccoo'], 'count': 1, 'sum': 1.0}, 'piza': {'similar': ['piza'], 'count': 1, 'sum': 1.0}, 'hamburguer': {'similar': ['hamburguer'], 'count': 1, 'sum': 1.0}, 'taco': {'similar': ['taco'], 'count': 1, 'sum': 3.0}, 'pizzza': {'similar': ['pizzza'], 'count': 1, 'sum': 1.0}, 'arepa': {'similar': ['arepa'], 'count': 1, 'sum': 1.0}, 'pizza': {'similar': ['pizza'], 'count': 1, 'sum': 4.0}, 'Rice': {'similar': ['Rice'], 'count': 1, 'sum': 1.0}, '110790': {'similar': ['110790'], 'count': 1, 'sum': 1.0}, 'BEER': {'similar': ['BEER'], 'count': 1, 'sum': 1.0}, 'Cake': {'similar': ['Cake'], 'count': 1, 'sum': 1.0}, 'null': {'similar': ['null'], 'count': 1, 'sum': 1.0}, 'pasta': {'similar': ['pasta'], 'count': 1, 'sum': 2.0}}
In [159]:
type(result)
Out[159]:
str
In [68]:
# Collect `result` and fold it into a plain dict that maps each row's first
# field to its second field, echoing every row along the way.
# The loop body is indented again here; without the indentation the cell is not
# valid Python.
kv_dict = {}
for row in result.collect():
    # Relies on asDict() yielding the fields in column order, so that positions
    # 0 and 1 are the first two columns -- presumably true for collected rows.
    _row = list(row.asDict().values())
    print(_row)
    kv_dict[_row[0]] = _row[1]
['taaaccoo', 1]
['piza', 1]
['hamburguer', 1]
['taco', 3]
['BEER', 1]
['pizzza', 1]
['arepa', 1]
['pizza', 4]
['Rice', 1]
['110790', 1]
['Cake', 1]
['null', 1]
['pasta', 2]
In [69]:
print(kv_dict)
{'taaaccoo': 1, 'piza': 1, 'hamburguer': 1, 'taco': 3, 'BEER': 1, 'pizzza': 1, 'arepa': 1, 'pizza': 4, 'Rice': 1, '110790': 1, 'Cake': 1, 'null': 1, 'pasta': 2}
In [46]:
# NOTE(review): the recorded run failed with AttributeError ('str' object has no
# attribute 'cols'), i.e. `a` was not a DataFrame when this cell ran. Bind `a` to
# the intended DataFrame first -- which one is not visible in this notebook.
a.cols.replace("product***LEVENSHTEIN_DISTANCE", 0, None).table()
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-46-d4821b29c8c9> in <module>
----> 1 a.cols.replace("product***LEVENSHTEIN_DISTANCE", 0, None).table()
AttributeError: 'str' object has no attribute 'cols'
In [47]:
# NOTE(review): the recorded run failed with AttributeError ('str' object has no
# attribute 'rows'), i.e. `a` was not a DataFrame when this cell ran. Bind `a` to
# the intended DataFrame first -- which one is not visible in this notebook.
a.rows.drop(where=((a["product_LEVENSHTEIN_1"]!=a["product_LEVENSHTEIN_2"])& (a["product***LEVENSHTEIN_DISTANCE"]==0))).table()
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-47-fb2466275319> in <module>
----> 1 a.rows.drop(where=((a["product_LEVENSHTEIN_1"]!=a["product_LEVENSHTEIN_2"])& (a["product***LEVENSHTEIN_DISTANCE"]==0))).table()
AttributeError: 'str' object has no attribute 'rows'
In [12]:
In [ ]:
In [ ]:
Content source: ironmussa/Optimus
Similar notebooks: