In [5]:
# We'll start by seeing if we can access the Spark Context which represents the gateway into the Spark API:

sc # = SparkContext(appName="protomodel")


Out[5]:
<pyspark.context.SparkContext at 0x7ff75ad21c50>

In [3]:
sqlCtx


Out[3]:
<pyspark.sql.context.HiveContext at 0x7f1e13518050>

In [1]:
# We'll then create an RDD using sc.parallelize with 20 partitions which will be distributed amongst the Spark 
# Worker nodes and also verify the number of partitions in the RDD:
rdd = sc.parallelize(range(1000), 20)  
rdd.getNumPartitions()


Out[1]:
20

In [6]:
rawDF = sqlCtx.sql("select * from stage_churn.mou_controle limit 5")

In [7]:
rawDF.collect() # el comando anterior se ejecuta solo con ejecutar collect(), "lazy evaluation"


Out[7]:
[Row(nr_tlfn=u'11940140231', id_lnha=u'1029977430', mou_total_sum_m1=0.0, mou_out_sum_m1=0.0, mou_in_sum_m1=0.0, mou_total_sum_m2=0.0, mou_out_sum_m2=0.0, mou_in_sum_m2=0.0, mou_total_sum_m3=None, mou_out_sum_m3=None, mou_in_sum_m3=None, mou_total_sum_m4=None, mou_out_sum_m4=None, mou_in_sum_m4=None, mou_total_sum_m5=None, mou_out_sum_m5=None, mou_in_sum_m5=None, mou_total_sum_m6=None, mou_out_sum_m6=None, mou_in_sum_m6=None, mou_total_sum_avg3=None, mou_out_sum_avg3=None, mou_in_sum_avg3=None, mou_total_sum_avg6=None, mou_out_sum_avg6=None, mou_in_sum_avg6=None),
 Row(nr_tlfn=u'11940140532', id_lnha=u'1035172632', mou_total_sum_m1=0.0, mou_out_sum_m1=0.0, mou_in_sum_m1=0.0, mou_total_sum_m2=None, mou_out_sum_m2=None, mou_in_sum_m2=None, mou_total_sum_m3=None, mou_out_sum_m3=None, mou_in_sum_m3=None, mou_total_sum_m4=None, mou_out_sum_m4=None, mou_in_sum_m4=None, mou_total_sum_m5=None, mou_out_sum_m5=None, mou_in_sum_m5=None, mou_total_sum_m6=None, mou_out_sum_m6=None, mou_in_sum_m6=None, mou_total_sum_avg3=None, mou_out_sum_avg3=None, mou_in_sum_avg3=None, mou_total_sum_avg6=None, mou_out_sum_avg6=None, mou_in_sum_avg6=None),
 Row(nr_tlfn=u'11940142751', id_lnha=u'1001044994', mou_total_sum_m1=24.5, mou_out_sum_m1=20.51666666666667, mou_in_sum_m1=3.983333333333334, mou_total_sum_m2=28.983333333333334, mou_out_sum_m2=21.633333333333333, mou_in_sum_m2=7.349999999999999, mou_total_sum_m3=40.366666666666674, mou_out_sum_m3=38.56666666666667, mou_in_sum_m3=1.8, mou_total_sum_m4=None, mou_out_sum_m4=None, mou_in_sum_m4=None, mou_total_sum_m5=None, mou_out_sum_m5=None, mou_in_sum_m5=None, mou_total_sum_m6=None, mou_out_sum_m6=None, mou_in_sum_m6=None, mou_total_sum_avg3=31.283333333333335, mou_out_sum_avg3=26.905555555555555, mou_in_sum_avg3=4.377777777777777, mou_total_sum_avg6=None, mou_out_sum_avg6=None, mou_in_sum_avg6=None),
 Row(nr_tlfn=u'11941001944', id_lnha=u'1030111149', mou_total_sum_m1=0.0, mou_out_sum_m1=0.0, mou_in_sum_m1=0.0, mou_total_sum_m2=0.0, mou_out_sum_m2=0.0, mou_in_sum_m2=0.0, mou_total_sum_m3=None, mou_out_sum_m3=None, mou_in_sum_m3=None, mou_total_sum_m4=None, mou_out_sum_m4=None, mou_in_sum_m4=None, mou_total_sum_m5=None, mou_out_sum_m5=None, mou_in_sum_m5=None, mou_total_sum_m6=None, mou_out_sum_m6=None, mou_in_sum_m6=None, mou_total_sum_avg3=None, mou_out_sum_avg3=None, mou_in_sum_avg3=None, mou_total_sum_avg6=None, mou_out_sum_avg6=None, mou_in_sum_avg6=None),
 Row(nr_tlfn=u'11941005319', id_lnha=u'898511570', mou_total_sum_m1=49.583333333333336, mou_out_sum_m1=40.81666666666666, mou_in_sum_m1=8.766666666666666, mou_total_sum_m2=36.333333333333336, mou_out_sum_m2=29.316666666666674, mou_in_sum_m2=7.016666666666666, mou_total_sum_m3=66.35, mou_out_sum_m3=55.516666666666666, mou_in_sum_m3=10.833333333333334, mou_total_sum_m4=16.88333333333333, mou_out_sum_m4=12.449999999999996, mou_in_sum_m4=4.433333333333334, mou_total_sum_m5=13.133333333333335, mou_out_sum_m5=8.083333333333334, mou_in_sum_m5=6.150000000000001, mou_total_sum_m6=19.916666666666664, mou_out_sum_m6=11.616666666666667, mou_in_sum_m6=8.633333333333333, mou_total_sum_avg3=50.75555555555555, mou_out_sum_avg3=41.88333333333333, mou_in_sum_avg3=8.872222222222222, mou_total_sum_avg6=33.699999999999996, mou_out_sum_avg6=26.3, mou_in_sum_avg6=7.638888888888889)]

In [19]:
rawDF.take(2)


Out[19]:
[Row(nr_tlfn=u'11930001092', id_lnha=u'1015329050', mou_total_sum_m1=0.0, mou_out_sum_m1=0.0, mou_in_sum_m1=0.0, mou_total_sum_m2=0.0, mou_out_sum_m2=0.0, mou_in_sum_m2=0.0, mou_total_sum_m3=0.0, mou_out_sum_m3=0.0, mou_in_sum_m3=0.0, mou_total_sum_m4=0.0, mou_out_sum_m4=0.0, mou_in_sum_m4=0.0, mou_total_sum_m5=0.0, mou_out_sum_m5=0.0, mou_in_sum_m5=0.0, mou_total_sum_m6=0.0, mou_out_sum_m6=0.0, mou_in_sum_m6=0.0, mou_total_sum_avg3=0.0, mou_out_sum_avg3=0.0, mou_in_sum_avg3=0.0, mou_total_sum_avg6=0.0, mou_out_sum_avg6=0.0, mou_in_sum_avg6=0.0),
 Row(nr_tlfn=u'11930006809', id_lnha=u'888434927', mou_total_sum_m1=166.6833333333333, mou_out_sum_m1=63.30000000000001, mou_in_sum_m1=103.38333333333333, mou_total_sum_m2=134.2333333333333, mou_out_sum_m2=73.63333333333331, mou_in_sum_m2=60.600000000000016, mou_total_sum_m3=348.8166666666668, mou_out_sum_m3=176.3166666666666, mou_in_sum_m3=172.50000000000003, mou_total_sum_m4=177.96666666666673, mou_out_sum_m4=123.15, mou_in_sum_m4=54.81666666666666, mou_total_sum_m5=493.4166666666665, mou_out_sum_m5=297.28333333333325, mou_in_sum_m5=196.1333333333333, mou_total_sum_m6=293.78333333333336, mou_out_sum_m6=163.51666666666668, mou_in_sum_m6=130.26666666666668, mou_total_sum_avg3=216.57777777777778, mou_out_sum_avg3=104.41666666666664, mou_in_sum_avg3=112.16111111111111, mou_total_sum_avg6=269.15, mou_out_sum_avg6=149.5333333333333, mou_in_sum_avg6=119.61666666666666)]

In [16]:
rawDF.toPandas() # se recomienda meter solo hasta 500.000 registros, los datasets pandas siempre están en RAM


Out[16]:
nr_tlfn id_lnha mou_total_sum_m1 mou_out_sum_m1 mou_in_sum_m1 mou_total_sum_m2 mou_out_sum_m2 mou_in_sum_m2 mou_total_sum_m3 mou_out_sum_m3 ... mou_in_sum_m5 mou_total_sum_m6 mou_out_sum_m6 mou_in_sum_m6 mou_total_sum_avg3 mou_out_sum_avg3 mou_in_sum_avg3 mou_total_sum_avg6 mou_out_sum_avg6 mou_in_sum_avg6
0 11930021068 1026528111 0.000000 0.000000 0.000000 0.00 0.000000 0.000000 0.000000 0.000000 ... NaN NaN NaN NaN 0.000000 0.000000 0.000000 NaN NaN NaN
1 11940119432 1029979568 0.000000 0.000000 0.000000 0.00 0.000000 0.000000 NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
2 11940379107 1031472408 0.000000 0.000000 0.000000 0.00 0.000000 0.000000 NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
3 11940408888 1035936569 0.000000 0.000000 0.000000 NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
4 11941000671 1010277127 416.216667 389.083333 27.133333 142.35 127.616667 14.733333 125.766667 95.816667 ... 18.316667 62.2 39.233333 22.966667 228.111111 204.172222 23.938889 151.772222 129.819444 21.952778

5 rows × 26 columns


In [17]:
rawDF.show()


+-----------+----------+----------------+--------------+-------------+-------------------+--------------+-------------------+----------------+--------------+-------------+----------------+--------------+-------------+----------------+--------------+-------------+----------------+--------------+-------------+--------------------+----------------+--------------------+--------------------+----------------+--------------------+
|    nr_tlfn|   id_lnha|mou_total_sum_m1|mou_out_sum_m1|mou_in_sum_m1|   mou_total_sum_m2|mou_out_sum_m2|      mou_in_sum_m2|mou_total_sum_m3|mou_out_sum_m3|mou_in_sum_m3|mou_total_sum_m4|mou_out_sum_m4|mou_in_sum_m4|mou_total_sum_m5|mou_out_sum_m5|mou_in_sum_m5|mou_total_sum_m6|mou_out_sum_m6|mou_in_sum_m6|  mou_total_sum_avg3|mou_out_sum_avg3|     mou_in_sum_avg3|  mou_total_sum_avg6|mou_out_sum_avg6|     mou_in_sum_avg6|
+-----------+----------+----------------+--------------+-------------+-------------------+--------------+-------------------+----------------+--------------+-------------+----------------+--------------+-------------+----------------+--------------+-------------+----------------+--------------+-------------+--------------------+----------------+--------------------+--------------------+----------------+--------------------+
|11931471767|1024353299|             0.0|           0.0|          0.0|                0.0|           0.0|                0.0|             0.0|           0.0|          0.0|             0.0|           0.0|          0.0|            null|          null|         null|            null|          null|         null|                 0.0|             0.0|                 0.0|                null|            null|                null|
|11940019106|1031108303|             0.0|           0.0|          0.0|                0.0|           0.0|                0.0|            null|          null|         null|            null|          null|         null|            null|          null|         null|            null|          null|         null|                null|            null|                null|                null|            null|                null|
|11940070246|1010278473|             0.0|           0.0|          0.0|                0.0|           0.0|                0.0|             0.0|           0.0|          0.0|             0.0|           0.0|          0.0|             0.0|           0.0|          0.0|             0.0|           0.0|          0.0|                 0.0|             0.0|                 0.0|                 0.0|             0.0|                 0.0|
|11940118241|1029006194|             0.0|           0.0|          0.0|                0.0|           0.0|                0.0|             0.0|           0.0|          0.0|            null|          null|         null|            null|          null|         null|            null|          null|         null|                 0.0|             0.0|                 0.0|                null|            null|                null|
|11941000268| 864929455|             0.0|           0.0|          0.0|0.06666666666666667|           0.0|0.06666666666666667|             0.0|           0.0|          0.0|             0.0|           0.0|          0.0|             0.0|           0.0|          0.0|             0.0|           0.0|          0.0|0.022222222222222223|             0.0|0.022222222222222223|0.011111111111111112|             0.0|0.011111111111111112|
+-----------+----------+----------------+--------------+-------------+-------------------+--------------+-------------------+----------------+--------------+-------------+----------------+--------------+-------------+----------------+--------------+-------------+----------------+--------------+-------------+--------------------+----------------+--------------------+--------------------+----------------+--------------------+


In [20]:
data = [1, 2, 3, 4, 5]
distData = sc.parallelize(data)
distData


Out[20]:
ParallelCollectionRDD[55] at parallelize at PythonRDD.scala:423

In [21]:



Out[21]:
DataFrame[nr_tlfn: string, id_lnha: string, mou_total_sum_m1: double, mou_out_sum_m1: double, mou_in_sum_m1: double, mou_total_sum_m2: double, mou_out_sum_m2: double, mou_in_sum_m2: double, mou_total_sum_m3: double, mou_out_sum_m3: double, mou_in_sum_m3: double, mou_total_sum_m4: double, mou_out_sum_m4: double, mou_in_sum_m4: double, mou_total_sum_m5: double, mou_out_sum_m5: double, mou_in_sum_m5: double, mou_total_sum_m6: double, mou_out_sum_m6: double, mou_in_sum_m6: double, mou_total_sum_avg3: double, mou_out_sum_avg3: double, mou_in_sum_avg3: double, mou_total_sum_avg6: double, mou_out_sum_avg6: double, mou_in_sum_avg6: double]