In [5]:
# We'll start by seeing if we can access the Spark Context which represents the gateway into the Spark API:
sc # = SparkContext(appName="protomodel")
Out[5]:
<pyspark.context.SparkContext at 0x7ff75ad21c50>
In [3]:
sqlCtx
Out[3]:
<pyspark.sql.context.HiveContext at 0x7f1e13518050>
In [1]:
# We'll then create an RDD using sc.parallelize with 20 partitions which will be distributed amongst the Spark
# Worker nodes and also verify the number of partitions in the RDD:
rdd = sc.parallelize(range(1000), 20)
rdd.getNumPartitions()
Out[1]:
20
In [6]:
rawDF = sqlCtx.sql("select * from stage_churn.mou_controle limit 5")
In [7]:
rawDF.collect() # el comando anterior se ejecuta solo con ejecutar collect(), "lazy evaluation"
Out[7]:
[Row(nr_tlfn=u'11940140231', id_lnha=u'1029977430', mou_total_sum_m1=0.0, mou_out_sum_m1=0.0, mou_in_sum_m1=0.0, mou_total_sum_m2=0.0, mou_out_sum_m2=0.0, mou_in_sum_m2=0.0, mou_total_sum_m3=None, mou_out_sum_m3=None, mou_in_sum_m3=None, mou_total_sum_m4=None, mou_out_sum_m4=None, mou_in_sum_m4=None, mou_total_sum_m5=None, mou_out_sum_m5=None, mou_in_sum_m5=None, mou_total_sum_m6=None, mou_out_sum_m6=None, mou_in_sum_m6=None, mou_total_sum_avg3=None, mou_out_sum_avg3=None, mou_in_sum_avg3=None, mou_total_sum_avg6=None, mou_out_sum_avg6=None, mou_in_sum_avg6=None),
Row(nr_tlfn=u'11940140532', id_lnha=u'1035172632', mou_total_sum_m1=0.0, mou_out_sum_m1=0.0, mou_in_sum_m1=0.0, mou_total_sum_m2=None, mou_out_sum_m2=None, mou_in_sum_m2=None, mou_total_sum_m3=None, mou_out_sum_m3=None, mou_in_sum_m3=None, mou_total_sum_m4=None, mou_out_sum_m4=None, mou_in_sum_m4=None, mou_total_sum_m5=None, mou_out_sum_m5=None, mou_in_sum_m5=None, mou_total_sum_m6=None, mou_out_sum_m6=None, mou_in_sum_m6=None, mou_total_sum_avg3=None, mou_out_sum_avg3=None, mou_in_sum_avg3=None, mou_total_sum_avg6=None, mou_out_sum_avg6=None, mou_in_sum_avg6=None),
Row(nr_tlfn=u'11940142751', id_lnha=u'1001044994', mou_total_sum_m1=24.5, mou_out_sum_m1=20.51666666666667, mou_in_sum_m1=3.983333333333334, mou_total_sum_m2=28.983333333333334, mou_out_sum_m2=21.633333333333333, mou_in_sum_m2=7.349999999999999, mou_total_sum_m3=40.366666666666674, mou_out_sum_m3=38.56666666666667, mou_in_sum_m3=1.8, mou_total_sum_m4=None, mou_out_sum_m4=None, mou_in_sum_m4=None, mou_total_sum_m5=None, mou_out_sum_m5=None, mou_in_sum_m5=None, mou_total_sum_m6=None, mou_out_sum_m6=None, mou_in_sum_m6=None, mou_total_sum_avg3=31.283333333333335, mou_out_sum_avg3=26.905555555555555, mou_in_sum_avg3=4.377777777777777, mou_total_sum_avg6=None, mou_out_sum_avg6=None, mou_in_sum_avg6=None),
Row(nr_tlfn=u'11941001944', id_lnha=u'1030111149', mou_total_sum_m1=0.0, mou_out_sum_m1=0.0, mou_in_sum_m1=0.0, mou_total_sum_m2=0.0, mou_out_sum_m2=0.0, mou_in_sum_m2=0.0, mou_total_sum_m3=None, mou_out_sum_m3=None, mou_in_sum_m3=None, mou_total_sum_m4=None, mou_out_sum_m4=None, mou_in_sum_m4=None, mou_total_sum_m5=None, mou_out_sum_m5=None, mou_in_sum_m5=None, mou_total_sum_m6=None, mou_out_sum_m6=None, mou_in_sum_m6=None, mou_total_sum_avg3=None, mou_out_sum_avg3=None, mou_in_sum_avg3=None, mou_total_sum_avg6=None, mou_out_sum_avg6=None, mou_in_sum_avg6=None),
Row(nr_tlfn=u'11941005319', id_lnha=u'898511570', mou_total_sum_m1=49.583333333333336, mou_out_sum_m1=40.81666666666666, mou_in_sum_m1=8.766666666666666, mou_total_sum_m2=36.333333333333336, mou_out_sum_m2=29.316666666666674, mou_in_sum_m2=7.016666666666666, mou_total_sum_m3=66.35, mou_out_sum_m3=55.516666666666666, mou_in_sum_m3=10.833333333333334, mou_total_sum_m4=16.88333333333333, mou_out_sum_m4=12.449999999999996, mou_in_sum_m4=4.433333333333334, mou_total_sum_m5=13.133333333333335, mou_out_sum_m5=8.083333333333334, mou_in_sum_m5=6.150000000000001, mou_total_sum_m6=19.916666666666664, mou_out_sum_m6=11.616666666666667, mou_in_sum_m6=8.633333333333333, mou_total_sum_avg3=50.75555555555555, mou_out_sum_avg3=41.88333333333333, mou_in_sum_avg3=8.872222222222222, mou_total_sum_avg6=33.699999999999996, mou_out_sum_avg6=26.3, mou_in_sum_avg6=7.638888888888889)]
In [19]:
rawDF.take(2)
Out[19]:
[Row(nr_tlfn=u'11930001092', id_lnha=u'1015329050', mou_total_sum_m1=0.0, mou_out_sum_m1=0.0, mou_in_sum_m1=0.0, mou_total_sum_m2=0.0, mou_out_sum_m2=0.0, mou_in_sum_m2=0.0, mou_total_sum_m3=0.0, mou_out_sum_m3=0.0, mou_in_sum_m3=0.0, mou_total_sum_m4=0.0, mou_out_sum_m4=0.0, mou_in_sum_m4=0.0, mou_total_sum_m5=0.0, mou_out_sum_m5=0.0, mou_in_sum_m5=0.0, mou_total_sum_m6=0.0, mou_out_sum_m6=0.0, mou_in_sum_m6=0.0, mou_total_sum_avg3=0.0, mou_out_sum_avg3=0.0, mou_in_sum_avg3=0.0, mou_total_sum_avg6=0.0, mou_out_sum_avg6=0.0, mou_in_sum_avg6=0.0),
Row(nr_tlfn=u'11930006809', id_lnha=u'888434927', mou_total_sum_m1=166.6833333333333, mou_out_sum_m1=63.30000000000001, mou_in_sum_m1=103.38333333333333, mou_total_sum_m2=134.2333333333333, mou_out_sum_m2=73.63333333333331, mou_in_sum_m2=60.600000000000016, mou_total_sum_m3=348.8166666666668, mou_out_sum_m3=176.3166666666666, mou_in_sum_m3=172.50000000000003, mou_total_sum_m4=177.96666666666673, mou_out_sum_m4=123.15, mou_in_sum_m4=54.81666666666666, mou_total_sum_m5=493.4166666666665, mou_out_sum_m5=297.28333333333325, mou_in_sum_m5=196.1333333333333, mou_total_sum_m6=293.78333333333336, mou_out_sum_m6=163.51666666666668, mou_in_sum_m6=130.26666666666668, mou_total_sum_avg3=216.57777777777778, mou_out_sum_avg3=104.41666666666664, mou_in_sum_avg3=112.16111111111111, mou_total_sum_avg6=269.15, mou_out_sum_avg6=149.5333333333333, mou_in_sum_avg6=119.61666666666666)]
In [16]:
rawDF.toPandas() # se recomienda meter solo hasta 500.000 registros, los datasets pandas siempre están en RAM
Out[16]:
nr_tlfn
id_lnha
mou_total_sum_m1
mou_out_sum_m1
mou_in_sum_m1
mou_total_sum_m2
mou_out_sum_m2
mou_in_sum_m2
mou_total_sum_m3
mou_out_sum_m3
...
mou_in_sum_m5
mou_total_sum_m6
mou_out_sum_m6
mou_in_sum_m6
mou_total_sum_avg3
mou_out_sum_avg3
mou_in_sum_avg3
mou_total_sum_avg6
mou_out_sum_avg6
mou_in_sum_avg6
0
11930021068
1026528111
0.000000
0.000000
0.000000
0.00
0.000000
0.000000
0.000000
0.000000
...
NaN
NaN
NaN
NaN
0.000000
0.000000
0.000000
NaN
NaN
NaN
1
11940119432
1029979568
0.000000
0.000000
0.000000
0.00
0.000000
0.000000
NaN
NaN
...
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
2
11940379107
1031472408
0.000000
0.000000
0.000000
0.00
0.000000
0.000000
NaN
NaN
...
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
3
11940408888
1035936569
0.000000
0.000000
0.000000
NaN
NaN
NaN
NaN
NaN
...
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
4
11941000671
1010277127
416.216667
389.083333
27.133333
142.35
127.616667
14.733333
125.766667
95.816667
...
18.316667
62.2
39.233333
22.966667
228.111111
204.172222
23.938889
151.772222
129.819444
21.952778
5 rows × 26 columns
In [17]:
rawDF.show()
+-----------+----------+----------------+--------------+-------------+-------------------+--------------+-------------------+----------------+--------------+-------------+----------------+--------------+-------------+----------------+--------------+-------------+----------------+--------------+-------------+--------------------+----------------+--------------------+--------------------+----------------+--------------------+
| nr_tlfn| id_lnha|mou_total_sum_m1|mou_out_sum_m1|mou_in_sum_m1| mou_total_sum_m2|mou_out_sum_m2| mou_in_sum_m2|mou_total_sum_m3|mou_out_sum_m3|mou_in_sum_m3|mou_total_sum_m4|mou_out_sum_m4|mou_in_sum_m4|mou_total_sum_m5|mou_out_sum_m5|mou_in_sum_m5|mou_total_sum_m6|mou_out_sum_m6|mou_in_sum_m6| mou_total_sum_avg3|mou_out_sum_avg3| mou_in_sum_avg3| mou_total_sum_avg6|mou_out_sum_avg6| mou_in_sum_avg6|
+-----------+----------+----------------+--------------+-------------+-------------------+--------------+-------------------+----------------+--------------+-------------+----------------+--------------+-------------+----------------+--------------+-------------+----------------+--------------+-------------+--------------------+----------------+--------------------+--------------------+----------------+--------------------+
|11931471767|1024353299| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| null| null| null| null| null| null| 0.0| 0.0| 0.0| null| null| null|
|11940019106|1031108303| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null| null|
|11940070246|1010278473| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0|
|11940118241|1029006194| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| null| null| null| null| null| null| null| null| null| 0.0| 0.0| 0.0| null| null| null|
|11941000268| 864929455| 0.0| 0.0| 0.0|0.06666666666666667| 0.0|0.06666666666666667| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0| 0.0|0.022222222222222223| 0.0|0.022222222222222223|0.011111111111111112| 0.0|0.011111111111111112|
+-----------+----------+----------------+--------------+-------------+-------------------+--------------+-------------------+----------------+--------------+-------------+----------------+--------------+-------------+----------------+--------------+-------------+----------------+--------------+-------------+--------------------+----------------+--------------------+--------------------+----------------+--------------------+
In [20]:
data = [1, 2, 3, 4, 5]
distData = sc.parallelize(data)
distData
Out[20]:
ParallelCollectionRDD[55] at parallelize at PythonRDD.scala:423
In [21]:
Out[21]:
DataFrame[nr_tlfn: string, id_lnha: string, mou_total_sum_m1: double, mou_out_sum_m1: double, mou_in_sum_m1: double, mou_total_sum_m2: double, mou_out_sum_m2: double, mou_in_sum_m2: double, mou_total_sum_m3: double, mou_out_sum_m3: double, mou_in_sum_m3: double, mou_total_sum_m4: double, mou_out_sum_m4: double, mou_in_sum_m4: double, mou_total_sum_m5: double, mou_out_sum_m5: double, mou_in_sum_m5: double, mou_total_sum_m6: double, mou_out_sum_m6: double, mou_in_sum_m6: double, mou_total_sum_avg3: double, mou_out_sum_avg3: double, mou_in_sum_avg3: double, mou_total_sum_avg6: double, mou_out_sum_avg6: double, mou_in_sum_avg6: double]
Content source: elmi-gemini/pyspark_utils
Similar notebooks: