In [1]:
%load_ext autoreload
%autoreload 2

import sys
# Put the parent directory on the import path before importing optimus
# (presumably so the local checkout is picked up -- confirm against the repo layout).
sys.path.append("..")

from optimus import Optimus

# Create the Optimus entry point; later cells use `op.read` and `op.spark`.
op = Optimus()


C:\Users\argenisleon\Anaconda3\lib\site-packages\socks.py:58: DeprecationWarning: Using or importing the ABCs from 'collections' instead of from 'collections.abc' is deprecated, and in 3.8 it will stop working
  from collections import Callable

    You are using PySparkling of version 2.4.10, but your PySpark is of
    version 2.3.1. Please make sure Spark and PySparkling versions are compatible. 

In [23]:
# Load the sample CSV: fields are separated by ";" and the first row is the header.
df = op.read.csv(
    "data/random.csv",
    sep=";",
    header=True,
)

In [2]:
# Preview the loaded frame: column names, types, nullability and the first rows.
df.table()


Viewing 10 of 15560 rows / 136 columns
3 partition(s)
LOCNCODE
1 (string)
nullable
LOCNDSCR
2 (string)
nullable
ADDRESS1
3 (string)
nullable
ADDRESS2
4 (string)
nullable
ADDRESS3
5 (string)
nullable
CITY
6 (string)
nullable
STATE
7 (string)
nullable
ZIPCODE
8 (string)
nullable
COUNTRY
9 (string)
nullable
Location_Segment
10 (string)
nullable
PAQ
11 (string)
nullable
TIPUNI
12 (string)
nullable
Tipo_unidad
13 (string)
nullable
ITEMNMBR
14 (string)
nullable
ITMSHNAM
15 (string)
nullable
MZ
16 (string)
nullable
LT
17 (string)
nullable
EDIF
18 (string)
nullable
NIVEL
19 (string)
nullable
NOUNI
20 (string)
nullable
CONDO
21 (string)
nullable
REGIMEN
22 (string)
nullable
ETAPA
23 (string)
nullable
PROTO
24 (string)
nullable
ITEMDESC
25 (string)
nullable
NIVELES
26 (string)
nullable
COCHERA
27 (string)
nullable
RECAM
28 (string)
nullable
ALCOB
29 (string)
nullable
BANOS
30 (string)
nullable
Num_Balcon
31 (string)
nullable
SALA
32 (string)
nullable
COMEDOR
33 (string)
nullable
COCINA
34 (string)
nullable
Cuarto_Lavado
35 (string)
nullable
Cuarto_Servicio
36 (string)
nullable
OTROX
37 (string)
nullable
OTROX1
38 (string)
nullable
SupCons
39 (string)
nullable
PATIOSERV
40 (string)
nullable
TERRAZA
41 (string)
nullable
BALCON
42 (string)
nullable
AZOTEA
43 (string)
nullable
Otros
44 (string)
nullable
AREATOT
45 (string)
nullable
FRENTE
46 (string)
nullable
Sup_Terreno
47 (string)
nullable
EXCEDENTE
48 (string)
nullable
OTRO1
49 (string)
nullable
OTRO2
50 (string)
nullable
TAMANO
51 (string)
nullable
UBICAVER
52 (string)
nullable
UBICAHORI
53 (string)
nullable
QTYONHND_
54 (string)
nullable
QTYSOLD
55 (string)
nullable
INACTIVE
56 (string)
nullable
UOMPRICE
57 (string)
nullable
MONTOAPA
58 (string)
nullable
PAGINI
59 (string)
nullable
ENGANCHE
60 (string)
nullable
FECHESCRIPRO
61 (string)
nullable
FECHAENTREGA
62 (string)
nullable
FECHASALIDAVENTAS
63 (string)
nullable
LIBERADO_NOLIBERADO
64 (string)
nullable
ACTIVO_INACTIVO
65 (string)
nullable
Estatus1Vivienda
66 (string)
nullable
Estatus2Vivienda
67 (string)
nullable
CUSTNMBR
68 (string)
nullable
Nombre_Completo
69 (string)
nullable
cNombre
70 (string)
nullable
cApellidoPaterno
71 (string)
nullable
cApellidoMaterno
72 (string)
nullable
cRfc
73 (string)
nullable
cCurp
74 (string)
nullable
fkIdGradoInteres
75 (string)
nullable
cSexo
76 (string)
nullable
cEmail
77 (string)
nullable
cTelefonoCasa
78 (string)
nullable
cTelefonoCelular
79 (string)
nullable
cTelefonoTrabajo
80 (string)
nullable
cNumeroSeguroSocial
81 (string)
nullable
dFechaNacimiento
82 (string)
nullable
cEstadoCivil
83 (string)
nullable
cRegimenConyugal
84 (string)
nullable
cNacionalidad
85 (string)
nullable
cLugarNacimiento
86 (string)
nullable
cRecomendadoPor
87 (string)
nullable
fkIdMedio
88 (string)
nullable
cMedioContacto
89 (string)
nullable
cCalle
90 (string)
nullable
cNumeroExterior
91 (string)
nullable
cNumeroInterior
92 (string)
nullable
cColonia
93 (string)
nullable
cMunicipio
94 (string)
nullable
cEstado
95 (string)
nullable
cPais
96 (string)
nullable
cCodigoPostal
97 (string)
nullable
nTiempoResidencia
98 (string)
nullable
cComentario
99 (string)
nullable
cNumeroIdentificacion
100 (string)
nullable
cTipoIdentificación
101 (string)
nullable
REFERENCIA
102 (string)
nullable
FACTURA
103 (string)
nullable
NOTACR
104 (string)
nullable
Precio_cierre
105 (string)
nullable
Precio_cierre_Tot
106 (string)
nullable
Aumento_al_Contrato
107 (string)
nullable
Condonacón
108 (string)
nullable
Precio_Escritura_Total
109 (string)
nullable
Precio_Dev
110 (string)
nullable
Precio_Dev_Total
111 (string)
nullable
Notarios_Proyectados
112 (string)
nullable
Gatos_A_terceros
113 (string)
nullable
Depositos
114 (string)
nullable
Saldo
115 (string)
nullable
dFechaCreacion
116 (string)
nullable
dFechaModificacion
117 (string)
nullable
FECHA_Cotizado
118 (string)
nullable
FECHA_SolApartado
119 (string)
nullable
FECHA_AutApartado
120 (string)
nullable
Vigencia_Apartado
121 (string)
nullable
FechaVencimientoApartado
122 (string)
nullable
FECHA_SolDictamen
123 (string)
nullable
FECHA_ProcDictamen
124 (string)
nullable
FECHA_DictaminadoLlamada
125 (string)
nullable
FECHA_DictaminadoFirma
126 (string)
nullable
FECHA_Dictaminado
127 (string)
nullable
FECHA_Rechazado
128 (string)
nullable
FECHA_EscrituraAvaluo
129 (string)
nullable
FECHA_EscrituraFolio
130 (string)
nullable
FolioEscsritura
131 (string)
nullable
FECHA_EscrituraReal
132 (string)
nullable
FECHA_Cancelado
133 (string)
nullable
FECHA_Liberado
134 (string)
nullable
FECHA_Entregado
135 (string)
nullable
MotivoCancelacion
136 (string)
nullable
ALV
Altos⋅Lindavista
Guanajuato⋅#⋅85
None
San⋅Bartolo⋅Atepehuacan
Gustavo⋅A.⋅Madero
Distrito⋅Federal
07730
Mexico
0531
None
2
ESTACIONAMIENTO
ALVV008
ALVCDEY0080
None
None
None
None
008
None
0
0
EST⋅CDEY
Cajon⋅virtual
None
None
None
None
None
None
None
None
None
None
None
None
None
2.2
None
None
None
None
None
None
2.4
None
0
None
None
Chico
Cajon⋅virtual
Cajon⋅virtual
0
0
1
0
None
None
None
None
None
None
NO⋅LIBERADO
INACTIVO
DISPONIBLE
000-DISPONIBLE
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
.00000
.00000
.00000
.00000
.00000
.00000
.00000
.00000
.00000
.00000
.00000
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
ALV
Altos⋅Lindavista
Guanajuato⋅#⋅85
None
San⋅Bartolo⋅Atepehuacan
Gustavo⋅A.⋅Madero
Distrito⋅Federal
07730
Mexico
0531
None
2
ESTACIONAMIENTO
ALVV021
ALVCDEY0690
None
None
None
None
069
None
0
0
EST⋅CDEY
Cajon⋅virtual
None
None
None
None
None
None
None
None
None
None
None
None
None
2.2
None
None
None
None
None
None
2.4
None
0
None
None
Chico
Cajon⋅virtual
Cajon⋅virtual
0
0
1
0
None
None
None
None
None
None
NO⋅LIBERADO
INACTIVO
DISPONIBLE
000-DISPONIBLE
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
.00000
.00000
.00000
.00000
.00000
.00000
.00000
.00000
.00000
.00000
.00000
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
ALV
Altos⋅Lindavista
Guanajuato⋅#⋅85
None
San⋅Bartolo⋅Atepehuacan
Gustavo⋅A.⋅Madero
Distrito⋅Federal
07730
Mexico
0531
None
2
ESTACIONAMIENTO
ALVV022
ALVCDEY0710
None
None
None
None
071
None
0
0
EST⋅CDEY
Cajon⋅virtual
None
None
None
None
None
None
None
None
None
None
None
None
None
2.2
None
None
None
None
None
None
2.4
None
0
None
None
Chico
Cajon⋅virtual
Cajon⋅virtual
0
0
1
0
None
None
None
None
None
None
NO⋅LIBERADO
INACTIVO
DISPONIBLE
000-DISPONIBLE
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
.00000
.00000
.00000
.00000
.00000
.00000
.00000
.00000
.00000
.00000
.00000
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
ALV
Altos⋅Lindavista
Guanajuato⋅#⋅85
None
San⋅Bartolo⋅Atepehuacan
Gustavo⋅A.⋅Madero
Distrito⋅Federal
07730
Mexico
0531
None
2
ESTACIONAMIENTO
ALVV027
ALVCDEY0810
None
None
None
None
081
None
0
0
EST⋅CDEY
Cajon⋅virtual
None
None
None
None
None
None
None
None
None
None
None
None
None
2.2
None
None
None
None
None
None
2.4
None
0
None
None
Chico
Cajon⋅virtual
Cajon⋅virtual
0
0
1
0
None
None
None
None
None
None
NO⋅LIBERADO
INACTIVO
DISPONIBLE
000-DISPONIBLE
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
.00000
.00000
.00000
.00000
.00000
.00000
.00000
.00000
.00000
.00000
.00000
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
ALV
Altos⋅Lindavista
Guanajuato⋅#⋅85
None
San⋅Bartolo⋅Atepehuacan
Gustavo⋅A.⋅Madero
Distrito⋅Federal
07730
Mexico
0531
None
2
ESTACIONAMIENTO
ALVV032
ALVCEEY0090
None
None
None
None
009
None
0
0
EST⋅CEEY
Cajon⋅virtual
None
None
None
None
None
None
None
None
None
None
None
None
None
2.2
None
None
None
None
None
None
2.4
None
0
None
None
Chico
Cajon⋅virtual
Cajon⋅virtual
0
0
1
0
None
None
None
None
None
None
NO⋅LIBERADO
INACTIVO
DISPONIBLE
000-DISPONIBLE
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
.00000
.00000
.00000
.00000
.00000
.00000
.00000
.00000
.00000
.00000
.00000
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
ALV
Altos⋅Lindavista
Guanajuato⋅#⋅85
None
San⋅Bartolo⋅Atepehuacan
Gustavo⋅A.⋅Madero
Distrito⋅Federal
07730
Mexico
0531
None
2
ESTACIONAMIENTO
ALVV035
ALVCEEY0150
None
None
None
None
015
None
0
0
EST⋅CEEY
Cajon⋅virtual
None
None
None
None
None
None
None
None
None
None
None
None
None
2.2
None
None
None
None
None
None
2.4
None
0
None
None
Chico
Cajon⋅virtual
Cajon⋅virtual
0
0
1
0
None
None
None
None
None
None
NO⋅LIBERADO
INACTIVO
DISPONIBLE
000-DISPONIBLE
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
.00000
.00000
.00000
.00000
.00000
.00000
.00000
.00000
.00000
.00000
.00000
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
ALV
Altos⋅Lindavista
Guanajuato⋅#⋅85
None
San⋅Bartolo⋅Atepehuacan
Gustavo⋅A.⋅Madero
Distrito⋅Federal
07730
Mexico
0531
None
2
ESTACIONAMIENTO
ALVV009
ALVCDEY0100
None
None
None
None
010
None
0
0
EST⋅CDEY
Cajon⋅virtual
None
None
None
None
None
None
None
None
None
None
None
None
None
2.2
None
None
None
None
None
None
2.4
None
0
None
None
Chico
Cajon⋅virtual
Cajon⋅virtual
0
0
1
0
None
None
None
None
None
None
NO⋅LIBERADO
INACTIVO
DISPONIBLE
000-DISPONIBLE
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
.00000
.00000
.00000
.00000
.00000
.00000
.00000
.00000
.00000
.00000
.00000
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
ALV
Altos⋅Lindavista
Guanajuato⋅#⋅85
None
San⋅Bartolo⋅Atepehuacan
Gustavo⋅A.⋅Madero
Distrito⋅Federal
07730
Mexico
0531
None
2
ESTACIONAMIENTO
ALVV012
ALVCDEY0160
None
None
None
None
016
None
0
0
EST⋅CDEY
Cajon⋅virtual
None
None
None
None
None
None
None
None
None
None
None
None
None
2.2
None
None
None
None
None
None
2.4
None
0
None
None
Chico
Cajon⋅virtual
Cajon⋅virtual
0
0
1
0
None
None
None
None
None
None
NO⋅LIBERADO
INACTIVO
DISPONIBLE
000-DISPONIBLE
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
.00000
.00000
.00000
.00000
.00000
.00000
.00000
.00000
.00000
.00000
.00000
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
ALV
Altos⋅Lindavista
Guanajuato⋅#⋅85
None
San⋅Bartolo⋅Atepehuacan
Gustavo⋅A.⋅Madero
Distrito⋅Federal
07730
Mexico
0531
None
2
ESTACIONAMIENTO
ALVV019
ALVCDEY0650
None
None
None
None
065
None
0
0
EST⋅CDEY
Cajon⋅virtual
None
None
None
None
None
None
None
None
None
None
None
None
None
2.2
None
None
None
None
None
None
2.4
None
0
None
None
Chico
Cajon⋅virtual
Cajon⋅virtual
0
0
1
0
None
None
None
None
None
None
NO⋅LIBERADO
INACTIVO
DISPONIBLE
000-DISPONIBLE
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
.00000
.00000
.00000
.00000
.00000
.00000
.00000
.00000
.00000
.00000
.00000
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
ALV
Altos⋅Lindavista
Guanajuato⋅#⋅85
None
San⋅Bartolo⋅Atepehuacan
Gustavo⋅A.⋅Madero
Distrito⋅Federal
07730
Mexico
0531
None
2
ESTACIONAMIENTO
ALVV044
ALVCUEY0340
None
None
None
None
034
None
0
0
EST⋅CUEY
Cajon⋅virtual
None
None
None
None
None
None
None
None
None
None
None
None
None
2.2
None
None
None
None
None
None
2.4
None
0
None
None
Chico
Cajon⋅virtual
Cajon⋅virtual
0
0
1
0
None
None
None
None
None
None
NO⋅LIBERADO
INACTIVO
DISPONIBLE
000-DISPONIBLE
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
.00000
.00000
.00000
.00000
.00000
.00000
.00000
.00000
.00000
.00000
.00000
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
Viewing 10 of 15560 rows / 136 columns
3 partition(s)

Key Collision


In [3]:
from optimus.ml import keycollision as keyCol

# Optional Spark settings, left disabled (shuffle partition count and broadcast-join threshold).
# op.spark.conf.set("spark.sql.shuffle.partitions", "4")
# op.spark.conf.set("spark.sql.autoBroadcastJoinThreshold", 1 * 1024 * 1024 * 1024)

# Add a STATE_FINGERPRINT column derived from STATE and preview the result
# (in the output, 'Distrito Federal' is fingerprinted as 'distritofederal').
keyCol.fingerprint(df, 'STATE').table()


Viewing 10 of 15560 rows / 137 columns
1 partition(s)
LOCNCODE
1 (string)
nullable
LOCNDSCR
2 (string)
nullable
ADDRESS1
3 (string)
nullable
ADDRESS2
4 (string)
nullable
ADDRESS3
5 (string)
nullable
CITY
6 (string)
nullable
STATE
7 (string)
nullable
ZIPCODE
8 (string)
nullable
COUNTRY
9 (string)
nullable
Location_Segment
10 (string)
nullable
PAQ
11 (string)
nullable
TIPUNI
12 (string)
nullable
Tipo_unidad
13 (string)
nullable
ITEMNMBR
14 (string)
nullable
ITMSHNAM
15 (string)
nullable
MZ
16 (string)
nullable
LT
17 (string)
nullable
EDIF
18 (string)
nullable
NIVEL
19 (string)
nullable
NOUNI
20 (string)
nullable
CONDO
21 (string)
nullable
REGIMEN
22 (string)
nullable
ETAPA
23 (string)
nullable
PROTO
24 (string)
nullable
ITEMDESC
25 (string)
nullable
NIVELES
26 (string)
nullable
COCHERA
27 (string)
nullable
RECAM
28 (string)
nullable
ALCOB
29 (string)
nullable
BANOS
30 (string)
nullable
Num_Balcon
31 (string)
nullable
SALA
32 (string)
nullable
COMEDOR
33 (string)
nullable
COCINA
34 (string)
nullable
Cuarto_Lavado
35 (string)
nullable
Cuarto_Servicio
36 (string)
nullable
OTROX
37 (string)
nullable
OTROX1
38 (string)
nullable
SupCons
39 (string)
nullable
PATIOSERV
40 (string)
nullable
TERRAZA
41 (string)
nullable
BALCON
42 (string)
nullable
AZOTEA
43 (string)
nullable
Otros
44 (string)
nullable
AREATOT
45 (string)
nullable
FRENTE
46 (string)
nullable
Sup_Terreno
47 (string)
nullable
EXCEDENTE
48 (string)
nullable
OTRO1
49 (string)
nullable
OTRO2
50 (string)
nullable
TAMANO
51 (string)
nullable
UBICAVER
52 (string)
nullable
UBICAHORI
53 (string)
nullable
QTYONHND_
54 (string)
nullable
QTYSOLD
55 (string)
nullable
INACTIVE
56 (string)
nullable
UOMPRICE
57 (string)
nullable
MONTOAPA
58 (string)
nullable
PAGINI
59 (string)
nullable
ENGANCHE
60 (string)
nullable
FECHESCRIPRO
61 (string)
nullable
FECHAENTREGA
62 (string)
nullable
FECHASALIDAVENTAS
63 (string)
nullable
LIBERADO_NOLIBERADO
64 (string)
nullable
ACTIVO_INACTIVO
65 (string)
nullable
Estatus1Vivienda
66 (string)
nullable
Estatus2Vivienda
67 (string)
nullable
CUSTNMBR
68 (string)
nullable
Nombre_Completo
69 (string)
nullable
cNombre
70 (string)
nullable
cApellidoPaterno
71 (string)
nullable
cApellidoMaterno
72 (string)
nullable
cRfc
73 (string)
nullable
cCurp
74 (string)
nullable
fkIdGradoInteres
75 (string)
nullable
cSexo
76 (string)
nullable
cEmail
77 (string)
nullable
cTelefonoCasa
78 (string)
nullable
cTelefonoCelular
79 (string)
nullable
cTelefonoTrabajo
80 (string)
nullable
cNumeroSeguroSocial
81 (string)
nullable
dFechaNacimiento
82 (string)
nullable
cEstadoCivil
83 (string)
nullable
cRegimenConyugal
84 (string)
nullable
cNacionalidad
85 (string)
nullable
cLugarNacimiento
86 (string)
nullable
cRecomendadoPor
87 (string)
nullable
fkIdMedio
88 (string)
nullable
cMedioContacto
89 (string)
nullable
cCalle
90 (string)
nullable
cNumeroExterior
91 (string)
nullable
cNumeroInterior
92 (string)
nullable
cColonia
93 (string)
nullable
cMunicipio
94 (string)
nullable
cEstado
95 (string)
nullable
cPais
96 (string)
nullable
cCodigoPostal
97 (string)
nullable
nTiempoResidencia
98 (string)
nullable
cComentario
99 (string)
nullable
cNumeroIdentificacion
100 (string)
nullable
cTipoIdentificación
101 (string)
nullable
REFERENCIA
102 (string)
nullable
FACTURA
103 (string)
nullable
NOTACR
104 (string)
nullable
Precio_cierre
105 (string)
nullable
Precio_cierre_Tot
106 (string)
nullable
Aumento_al_Contrato
107 (string)
nullable
Condonacón
108 (string)
nullable
Precio_Escritura_Total
109 (string)
nullable
Precio_Dev
110 (string)
nullable
Precio_Dev_Total
111 (string)
nullable
Notarios_Proyectados
112 (string)
nullable
Gatos_A_terceros
113 (string)
nullable
Depositos
114 (string)
nullable
Saldo
115 (string)
nullable
dFechaCreacion
116 (string)
nullable
dFechaModificacion
117 (string)
nullable
FECHA_Cotizado
118 (string)
nullable
FECHA_SolApartado
119 (string)
nullable
FECHA_AutApartado
120 (string)
nullable
Vigencia_Apartado
121 (string)
nullable
FechaVencimientoApartado
122 (string)
nullable
FECHA_SolDictamen
123 (string)
nullable
FECHA_ProcDictamen
124 (string)
nullable
FECHA_DictaminadoLlamada
125 (string)
nullable
FECHA_DictaminadoFirma
126 (string)
nullable
FECHA_Dictaminado
127 (string)
nullable
FECHA_Rechazado
128 (string)
nullable
FECHA_EscrituraAvaluo
129 (string)
nullable
FECHA_EscrituraFolio
130 (string)
nullable
FolioEscsritura
131 (string)
nullable
FECHA_EscrituraReal
132 (string)
nullable
FECHA_Cancelado
133 (string)
nullable
FECHA_Liberado
134 (string)
nullable
FECHA_Entregado
135 (string)
nullable
MotivoCancelacion
136 (string)
nullable
STATE_FINGERPRINT
137 (string)
nullable
ALV
Altos⋅Lindavista
Guanajuato⋅#⋅85
None
San⋅Bartolo⋅Atepehuacan
Gustavo⋅A.⋅Madero
Distrito⋅Federal
07730
Mexico
0531
None
2
ESTACIONAMIENTO
ALVV008
ALVCDEY0080
None
None
None
None
008
None
0
0
EST⋅CDEY
Cajon⋅virtual
None
None
None
None
None
None
None
None
None
None
None
None
None
2.2
None
None
None
None
None
None
2.4
None
0
None
None
Chico
Cajon⋅virtual
Cajon⋅virtual
0
0
1
0
None
None
None
None
None
None
NO⋅LIBERADO
INACTIVO
DISPONIBLE
000-DISPONIBLE
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
.00000
.00000
.00000
.00000
.00000
.00000
.00000
.00000
.00000
.00000
.00000
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
distritofederal
ALV
Altos⋅Lindavista
Guanajuato⋅#⋅85
None
San⋅Bartolo⋅Atepehuacan
Gustavo⋅A.⋅Madero
Distrito⋅Federal
07730
Mexico
0531
None
2
ESTACIONAMIENTO
ALVV021
ALVCDEY0690
None
None
None
None
069
None
0
0
EST⋅CDEY
Cajon⋅virtual
None
None
None
None
None
None
None
None
None
None
None
None
None
2.2
None
None
None
None
None
None
2.4
None
0
None
None
Chico
Cajon⋅virtual
Cajon⋅virtual
0
0
1
0
None
None
None
None
None
None
NO⋅LIBERADO
INACTIVO
DISPONIBLE
000-DISPONIBLE
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
.00000
.00000
.00000
.00000
.00000
.00000
.00000
.00000
.00000
.00000
.00000
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
distritofederal
ALV
Altos⋅Lindavista
Guanajuato⋅#⋅85
None
San⋅Bartolo⋅Atepehuacan
Gustavo⋅A.⋅Madero
Distrito⋅Federal
07730
Mexico
0531
None
2
ESTACIONAMIENTO
ALVV022
ALVCDEY0710
None
None
None
None
071
None
0
0
EST⋅CDEY
Cajon⋅virtual
None
None
None
None
None
None
None
None
None
None
None
None
None
2.2
None
None
None
None
None
None
2.4
None
0
None
None
Chico
Cajon⋅virtual
Cajon⋅virtual
0
0
1
0
None
None
None
None
None
None
NO⋅LIBERADO
INACTIVO
DISPONIBLE
000-DISPONIBLE
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
.00000
.00000
.00000
.00000
.00000
.00000
.00000
.00000
.00000
.00000
.00000
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
distritofederal
ALV
Altos⋅Lindavista
Guanajuato⋅#⋅85
None
San⋅Bartolo⋅Atepehuacan
Gustavo⋅A.⋅Madero
Distrito⋅Federal
07730
Mexico
0531
None
2
ESTACIONAMIENTO
ALVV027
ALVCDEY0810
None
None
None
None
081
None
0
0
EST⋅CDEY
Cajon⋅virtual
None
None
None
None
None
None
None
None
None
None
None
None
None
2.2
None
None
None
None
None
None
2.4
None
0
None
None
Chico
Cajon⋅virtual
Cajon⋅virtual
0
0
1
0
None
None
None
None
None
None
NO⋅LIBERADO
INACTIVO
DISPONIBLE
000-DISPONIBLE
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
.00000
.00000
.00000
.00000
.00000
.00000
.00000
.00000
.00000
.00000
.00000
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
distritofederal
ALV
Altos⋅Lindavista
Guanajuato⋅#⋅85
None
San⋅Bartolo⋅Atepehuacan
Gustavo⋅A.⋅Madero
Distrito⋅Federal
07730
Mexico
0531
None
2
ESTACIONAMIENTO
ALVV032
ALVCEEY0090
None
None
None
None
009
None
0
0
EST⋅CEEY
Cajon⋅virtual
None
None
None
None
None
None
None
None
None
None
None
None
None
2.2
None
None
None
None
None
None
2.4
None
0
None
None
Chico
Cajon⋅virtual
Cajon⋅virtual
0
0
1
0
None
None
None
None
None
None
NO⋅LIBERADO
INACTIVO
DISPONIBLE
000-DISPONIBLE
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
.00000
.00000
.00000
.00000
.00000
.00000
.00000
.00000
.00000
.00000
.00000
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
distritofederal
ALV
Altos⋅Lindavista
Guanajuato⋅#⋅85
None
San⋅Bartolo⋅Atepehuacan
Gustavo⋅A.⋅Madero
Distrito⋅Federal
07730
Mexico
0531
None
2
ESTACIONAMIENTO
ALVV035
ALVCEEY0150
None
None
None
None
015
None
0
0
EST⋅CEEY
Cajon⋅virtual
None
None
None
None
None
None
None
None
None
None
None
None
None
2.2
None
None
None
None
None
None
2.4
None
0
None
None
Chico
Cajon⋅virtual
Cajon⋅virtual
0
0
1
0
None
None
None
None
None
None
NO⋅LIBERADO
INACTIVO
DISPONIBLE
000-DISPONIBLE
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
.00000
.00000
.00000
.00000
.00000
.00000
.00000
.00000
.00000
.00000
.00000
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
distritofederal
ALV
Altos⋅Lindavista
Guanajuato⋅#⋅85
None
San⋅Bartolo⋅Atepehuacan
Gustavo⋅A.⋅Madero
Distrito⋅Federal
07730
Mexico
0531
None
2
ESTACIONAMIENTO
ALVV009
ALVCDEY0100
None
None
None
None
010
None
0
0
EST⋅CDEY
Cajon⋅virtual
None
None
None
None
None
None
None
None
None
None
None
None
None
2.2
None
None
None
None
None
None
2.4
None
0
None
None
Chico
Cajon⋅virtual
Cajon⋅virtual
0
0
1
0
None
None
None
None
None
None
NO⋅LIBERADO
INACTIVO
DISPONIBLE
000-DISPONIBLE
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
.00000
.00000
.00000
.00000
.00000
.00000
.00000
.00000
.00000
.00000
.00000
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
distritofederal
ALV
Altos⋅Lindavista
Guanajuato⋅#⋅85
None
San⋅Bartolo⋅Atepehuacan
Gustavo⋅A.⋅Madero
Distrito⋅Federal
07730
Mexico
0531
None
2
ESTACIONAMIENTO
ALVV012
ALVCDEY0160
None
None
None
None
016
None
0
0
EST⋅CDEY
Cajon⋅virtual
None
None
None
None
None
None
None
None
None
None
None
None
None
2.2
None
None
None
None
None
None
2.4
None
0
None
None
Chico
Cajon⋅virtual
Cajon⋅virtual
0
0
1
0
None
None
None
None
None
None
NO⋅LIBERADO
INACTIVO
DISPONIBLE
000-DISPONIBLE
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
.00000
.00000
.00000
.00000
.00000
.00000
.00000
.00000
.00000
.00000
.00000
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
distritofederal
ALV
Altos⋅Lindavista
Guanajuato⋅#⋅85
None
San⋅Bartolo⋅Atepehuacan
Gustavo⋅A.⋅Madero
Distrito⋅Federal
07730
Mexico
0531
None
2
ESTACIONAMIENTO
ALVV019
ALVCDEY0650
None
None
None
None
065
None
0
0
EST⋅CDEY
Cajon⋅virtual
None
None
None
None
None
None
None
None
None
None
None
None
None
2.2
None
None
None
None
None
None
2.4
None
0
None
None
Chico
Cajon⋅virtual
Cajon⋅virtual
0
0
1
0
None
None
None
None
None
None
NO⋅LIBERADO
INACTIVO
DISPONIBLE
000-DISPONIBLE
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
.00000
.00000
.00000
.00000
.00000
.00000
.00000
.00000
.00000
.00000
.00000
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
distritofederal
ALV
Altos⋅Lindavista
Guanajuato⋅#⋅85
None
San⋅Bartolo⋅Atepehuacan
Gustavo⋅A.⋅Madero
Distrito⋅Federal
07730
Mexico
0531
None
2
ESTACIONAMIENTO
ALVV044
ALVCUEY0340
None
None
None
None
034
None
0
0
EST⋅CUEY
Cajon⋅virtual
None
None
None
None
None
None
None
None
None
None
None
None
None
2.2
None
None
None
None
None
None
2.4
None
0
None
None
Chico
Cajon⋅virtual
Cajon⋅virtual
0
0
1
0
None
None
None
None
None
None
NO⋅LIBERADO
INACTIVO
DISPONIBLE
000-DISPONIBLE
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
.00000
.00000
.00000
.00000
.00000
.00000
.00000
.00000
.00000
.00000
.00000
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
distritofederal
Viewing 10 of 15560 rows / 137 columns
1 partition(s)

In [4]:
# Cluster the STATE values with the fingerprint method. The output has one row per cluster:
# its size, its member spellings, a row count and a recommended value.
keyCol.fingerprint_cluster(df, 'STATE').table()


Viewing 5 of 5 rows / 4 columns
200 partition(s)
STATE_CLUSTER_SIZE
1 (int)
not nullable
STATE_CLUSTER
2 (array<string>)
nullable
STATE_COUNT
3 (bigint)
nullable
STATE_RECOMMENDED
4 (string)
nullable
1
['Estado⋅de⋅México']
810
Estado⋅de⋅México
2
['México⋅D.F.',⋅'Mexico⋅D.F.']
2495
Mexico⋅D.F.
1
['D.F.']
66
D.F.
1
['Distriro⋅Federal']
259
Distriro⋅Federal
3
['Distrito⋅Federal',⋅'DISTRITO⋅FEDERAL',⋅'distrito⋅federal']
11930
Distrito⋅Federal
Viewing 5 of 5 rows / 4 columns
200 partition(s)

In [9]:
# n-gram fingerprint of STATE; the third argument (2) is presumably the n-gram size -- confirm
# against the keycollision API. Only the three listed columns are displayed.
keyCol.n_gram_fingerprint(df, 'STATE', 2).table(columns=["STATE","STATE_NGRAM", "STATE_NGRAM_FINGERPRINT"])


STATE_NGRAM STATE_NGRAM_FINGERPRINT
Viewing 10 of 15560 rows / 138 columns
1 partition(s)
STATE
1 (string)
nullable
STATE_NGRAM
2 (array<string>)
not nullable
STATE_NGRAM_FINGERPRINT
3 (string)
nullable
Distrito⋅Federal
['distritofederal']
Distrito⋅Federal
['distritofederal']
Distrito⋅Federal
['distritofederal']
Distrito⋅Federal
['distritofederal']
Distrito⋅Federal
['distritofederal']
Distrito⋅Federal
['distritofederal']
Distrito⋅Federal
['distritofederal']
Distrito⋅Federal
['distritofederal']
Distrito⋅Federal
['distritofederal']
Distrito⋅Federal
['distritofederal']
Viewing 10 of 15560 rows / 138 columns
1 partition(s)

In [10]:
# Cluster STATE values by their n-gram fingerprint (same 2 as above) and show the clusters.
# In the output every spelling ends up in a single cluster of size 8.
keyCol.n_gram_fingerprint_cluster(df, "STATE" , 2).table()


STATE_NGRAM STATE_NGRAM_FINGERPRINT
Viewing 1 of 1 rows / 4 columns
200 partition(s)
STATE_CLUSTER_SIZE
1 (int)
not nullable
STATE_CLUSTER
2 (array<string>)
nullable
STATE_COUNT
3 (double)
nullable
STATE_RECOMMENDED
4 (string)
nullable
8
['Distrito⋅Federal',⋅'México⋅D.F.',⋅'DISTRITO⋅FEDERAL',⋅'Mexico⋅D.F.',⋅'Distr...
15560.0
Mexico⋅D.F.
Viewing 1 of 1 rows / 4 columns
200 partition(s)

In [11]:
# Same clustering as the previous cell, returned as a list of dicts instead of a rendered
# table, which shows the full (untruncated) cluster membership.
keyCol.n_gram_fingerprint_cluster(df, "STATE" , 2).to_json()


STATE_NGRAM STATE_NGRAM_FINGERPRINT
Out[11]:
[{'STATE_CLUSTER_SIZE': 8,
  'STATE_CLUSTER': ['Distrito Federal',
   'México D.F.',
   'DISTRITO FEDERAL',
   'Mexico D.F.',
   'Distriro Federal',
   'D.F.',
   'Estado de México',
   'distrito federal'],
  'STATE_COUNT': 15560.0,
  'STATE_RECOMMENDED': 'Mexico D.F.'}]

Distance Cluster


In [12]:
from optimus.ml import distancecluster as dc
# Pairwise Levenshtein distances between the distinct STATE values. The values compared in
# the output are lowercased with spaces and punctuation removed (e.g. 'mexicodf').
dc.levenshtein_matrix(df,"STATE").table()


Viewing 10 of 25 rows / 3 columns
200 partition(s)
STATE_LEVENSHTEIN_1
1 (string)
nullable
STATE_LEVENSHTEIN_2
2 (string)
nullable
STATE_LEVENSHTEIN_DISTANCE
3 (int)
nullable
estadodemexico
estadodemexico
0
estadodemexico
mexicodf
10
estadodemexico
df
13
estadodemexico
distrirofederal
11
estadodemexico
distritofederal
11
mexicodf
estadodemexico
10
mexicodf
mexicodf
0
mexicodf
df
6
mexicodf
distrirofederal
12
mexicodf
distritofederal
12
Viewing 10 of 25 rows / 3 columns
200 partition(s)

In [13]:
# Reduce the distance matrix to one row per value. In the output each STATE_FROM is paired
# with what looks like its closest other value (STATE_TO) -- confirm against the API.
dc.levenshtein_filter(df,"STATE").table()


Viewing 5 of 5 rows / 3 columns
200 partition(s)
STATE_FROM
1 (string)
nullable
STATE_LEVENSHTEIN_DISTANCE
2 (int)
nullable
STATE_TO
3 (string)
nullable
estadodemexico
10
mexicodf
df
6
mexicodf
distrirofederal
1
distritofederal
distritofederal
1
distrirofederal
mexicodf
6
df
Viewing 5 of 5 rows / 3 columns
200 partition(s)

In [24]:
# Levenshtein-based clustering of STATE, returned as a list of dicts with the cluster members,
# cluster size, recommended value and row count.
dc.levenshtein_cluster(df,"STATE").to_json()


Out[24]:
[{'STATE_CLUSTER': ['Estado de México'],
  'STATE_CLUSTER_SIZE': 1,
  'STATE_RECOMMENDED': 'Estado de México',
  'STATE_COUNT': 810},
 {'STATE_CLUSTER': ['D.F.'],
  'STATE_CLUSTER_SIZE': 1,
  'STATE_RECOMMENDED': 'D.F.',
  'STATE_COUNT': 66},
 {'STATE_CLUSTER': ['Distriro Federal'],
  'STATE_CLUSTER_SIZE': 1,
  'STATE_RECOMMENDED': 'Distriro Federal',
  'STATE_COUNT': 259},
 {'STATE_CLUSTER': ['Distrito Federal',
   'DISTRITO FEDERAL',
   'distrito federal'],
  'STATE_CLUSTER_SIZE': 3,
  'STATE_RECOMMENDED': 'Distrito Federal',
  'STATE_COUNT': 11930},
 {'STATE_CLUSTER': ['Mexico D.F.', 'México D.F.'],
  'STATE_CLUSTER_SIZE': 2,
  'STATE_RECOMMENDED': 'Mexico D.F.',
  'STATE_COUNT': 2495}]

Feature Engineering


In [16]:
# Small in-memory sample: one (country, city, population) tuple per row.
data = [
    ("Japan", "Tokyo", 37800000),
    ("USA", "New York", 19795791),
    ("France", "Paris", 12341418),
    ("Spain", "Madrid", 6489162),
]
# NOTE: this rebinds `df`, replacing the frame that was loaded from data/random.csv.
df = op.spark.createDataFrame(data, ["country", "city", "population"])

In [17]:
# Preview the four-row sample frame (country, city, population).
df.table()


Viewing 4 of 4 rows / 3 columns
8 partition(s)
country
1 (string)
nullable
city
2 (string)
nullable
population
3 (bigint)
nullable
Japan
Tokyo
37800000
USA
New⋅York
19795791
France
Paris
12341418
Spain
Madrid
6489162
Viewing 4 of 4 rows / 3 columns
8 partition(s)

In [18]:
from optimus.ml import feature as fe

String to Index


In [19]:
# Index the two string columns; the result gains numeric city_INDEX and country_INDEX columns.
df_sti = fe.string_to_index(df, input_cols=["city", "country"])

In [20]:
# Show the original columns next to the new *_INDEX columns.
df_sti.table()


Viewing 4 of 4 rows / 5 columns
8 partition(s)
country
1 (string)
nullable
city
2 (string)
nullable
population
3 (bigint)
nullable
city_INDEX
4 (double)
not nullable
country_INDEX
5 (double)
not nullable
Japan
Tokyo
37800000
2.0
3.0
USA
New⋅York
19795791
3.0
2.0
France
Paris
12341418
0.0
1.0
Spain
Madrid
6489162
1.0
0.0
Viewing 4 of 4 rows / 5 columns
8 partition(s)

Index to String


In [27]:
# Going back to strings from index
df_its = fe.index_to_string(df_sti, input_cols=["country_INDEX"])

# Show DF with column "country_INDEX" mapped back to a string (new column "country_INDEX_string")
df_its.table()


Viewing 4 of 4 rows / 6 columns
8 partition(s)
country
1 (string)
nullable
city
2 (string)
nullable
population
3 (bigint)
nullable
city_INDEX
4 (double)
not nullable
country_INDEX
5 (double)
not nullable
country_INDEX_string
6 (string)
nullable
Japan
Tokyo
37800000
2.0
3.0
Japan
USA
New⋅York
19795791
3.0
2.0
USA
France
Paris
12341418
0.0
1.0
France
Spain
Madrid
6489162
1.0
0.0
Spain
Viewing 4 of 4 rows / 6 columns
8 partition(s)

One Hot Encoder


In [28]:
# Build a six-row frame of (id, category) pairs; ids are the positions 0..5.
data = list(enumerate(["a", "b", "c", "a", "a", "c"]))
df = op.spark.createDataFrame(data, ["id", "category"])

# One-hot encode the "id" column; the result gains a sparse-vector column id__ENCODED.
df_ohe = fe.one_hot_encoder(df, input_cols=["id"])

# Display the encoded frame (in the output, id 5 is rendered as the empty vector (5,[],[])).
df_ohe.table()


Viewing 6 of 6 rows / 3 columns
8 partition(s)
id
1 (bigint)
nullable
category
2 (string)
nullable
id__ENCODED
3 (vector)
nullable
0
a
(5,[0],[1.0])
1
b
(5,[1],[1.0])
2
c
(5,[2],[1.0])
3
a
(5,[3],[1.0])
4
a
(5,[4],[1.0])
5
c
(5,[],[])
Viewing 6 of 6 rows / 3 columns
8 partition(s)

Vector assembler


In [29]:
# Import Vectors
from pyspark.ml.linalg import Vectors

# Creating DataFrame: a single row whose "user_features" value is already a dense vector
data = [(0, 18, 1.0, Vectors.dense([0.0, 10.0, 0.5]), 1.0)]

df = op.spark.createDataFrame(data,["id", "hour", "mobile", "user_features", "clicked"])

# Assemble features: in the output the two scalars and the vector are flattened into a
# single "features" vector, [18.0,1.0,0.0,10.0,0.5]
df_va = fe.vector_assembler(df, input_cols=["hour", "mobile", "user_features"])

# Show assembled df (only the assembled vector and the "clicked" column)
print("Assembled columns 'hour', 'mobile', 'user_features' to vector column 'features'")
df_va.select("features", "clicked").table()


Assembled columns 'hour', 'mobile', 'user_features' to vector column 'features'
Viewing 1 of 1 rows / 2 columns
8 partition(s)
features
1 (vector)
nullable
clicked
2 (double)
nullable
[18.0,1.0,0.0,10.0,0.5]
1.0
Viewing 1 of 1 rows / 2 columns
8 partition(s)

Normalizer


In [47]:
# Import Vectors
from pyspark.ml.linalg import Vectors

# Three rows, each an id plus a 3-element dense feature vector
data = [
(0, Vectors.dense([1.0, 0.5, -1.0]),),
(1, Vectors.dense([2.0, 1.0, 1.0]),),
(2, Vectors.dense([4.0, 10.0, 2.0]),)
]

df = op.spark.createDataFrame(data,["id", "features"])

# p=2.0: in the output each vector is scaled to unit Euclidean (L2) length and stored in a
# new features_NORMALIZED column, e.g. [1.0,0.5,-1.0] -> [0.667,0.333,-0.667]
df_norm = fe.normalizer(df, input_cols=["features"], p=2.0)

# Show the original vectors next to their normalized versions
df_norm.table()


Viewing 3 of 3 rows / 2 columns
8 partition(s)
id
1 (bigint)
nullable
features
2 (vector)
nullable
0
[1.0,0.5,-1.0]
1
[2.0,1.0,1.0]
2
[4.0,10.0,2.0]
Viewing 3 of 3 rows / 2 columns
8 partition(s)
Viewing 3 of 3 rows / 3 columns
8 partition(s)
id
1 (bigint)
nullable
features
2 (vector)
nullable
features_NORMALIZED
3 (vector)
nullable
0
[1.0,0.5,-1.0]
[0.6666666666666666,0.3333333333333333,-0.6666666666666666]
1
[2.0,1.0,1.0]
[0.8164965809277261,0.4082482904638631,0.4082482904638631]
2
[4.0,10.0,2.0]
[0.3651483716701107,0.9128709291752769,0.18257418583505536]
Viewing 3 of 3 rows / 3 columns
8 partition(s)