In [1]:
# Spark imports
import pyspark.sql.functions as f
from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext
from pyspark.sql import Window
from pyspark.sql.types import *


conf = SparkConf() \
    .setAppName('pruebas') \
    .setMaster('yarn-client') \
    .set("spark.executor.memory", "4g") \
    .set("spark.executor.cores", "4") \
    .set("spark.executor.instances", "2") \
    .set("spark.driver.memory", "4g") \
    .set("spark.driver.cores", "2") \
    .set("spark.yarn.am.cores", '4') \
    .set("spark.yarn.am.memory", '8g') \
    .set("spark.logConf", "false") \
    .set("spark.kryoserializer.buffer.max", "2000MB") \
    .set("spark.app.id", "myapp") \
    .set("spark.ui.port", "4235") \
    .set("spark.yarn.queue", "gbic") \
    .set("spark.driver.maxResultSize","2048MB")    

# Stop a previous SparkContext before creating a new one, if needed:
# if sc:
#     sc.stop()

sc = SparkContext(conf=conf)
sqlContext = HiveContext(sc)
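
Once the context is up, the effective settings can be inspected through the standard SparkConf API. A minimal sanity check (the filtering on key prefixes is just an illustrative choice):

# Optional check: list the executor/driver settings the context picked up
for key, value in sorted(sc.getConf().getAll()):
    if key.startswith('spark.executor') or key.startswith('spark.driver'):
        print("%s = %s" % (key, value))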

In [7]:
from pyspark.sql.functions import rand, randn

# Create a DataFrame with one bigint column (id) and 10 rows.
df = sqlContext.range(0, 10)
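
range() also accepts an optional step and number of partitions. As a sketch of the same API (even_df is only an illustrative name, not used below):

# Even ids 0, 2, ..., 8 spread over 4 partitions
even_df = sqlContext.range(0, 10, 2, 4)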

In [9]:
df


Out[9]:
DataFrame[id: bigint]
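
Evaluating the bare DataFrame only echoes its schema; nothing is computed until an action runs. To inspect the schema explicitly or force a job (standard DataFrame methods, shown here only as a sketch):

df.printSchema()   # root |-- id: long (nullable = false)
df.count()         # triggers a Spark job and returns 10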

In [10]:
# Generate two other columns using uniform distribution and normal distribution.

#df.select("id", rand(seed=10).alias("uniform"), randn(seed=27).alias("normal")).show()
df = df.withColumn('uniform', rand(seed=10))
df = df.withColumn('normal', randn(seed=27))

# Cache so the random columns are computed once and reused by later actions
df.cache()
df.show()


+---+-------------------+--------------------+
| id|            uniform|              normal|
+---+-------------------+--------------------+
|  0|0.41371264720975787|  0.5888539012978773|
|  1| 0.1982919638208397| 0.06157382353970104|
|  2|0.12030715258495939|  1.0854146699817222|
|  3|0.44292918521277047| -0.4798519469521663|
|  4| 0.8898784253886249| -0.8820294772950535|
|  5| 0.2731073068483362|-0.15116027592854422|
|  6|   0.87079354700073|-0.27674189870783683|
|  7|0.27149331793166864|-0.18575112254167045|
|  8| 0.6037143578435027|   0.734722467897308|
|  9| 0.1435668838975337|-0.30123700668427145|
+---+-------------------+--------------------+
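
A quick way to summarize the two generated columns is describe(), which reports count, mean, stddev, min and max; a sketch using the built-in method (output omitted):

# Summary statistics for the generated columns
df.describe('uniform', 'normal').show()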


In [11]:
# Compute the total sum of a column
# NOTE: these imports shadow Python's built-in min, max and sum
from pyspark.sql.functions import mean, min, max, sum

fff = df.select([sum('id')])
# head() returns the single aggregated Row; [0] extracts the scalar value
fff.head()[0]


Out[11]:
45

In [12]:
fff.head()


Out[12]:
Row(sum(id)=45)
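
The same kind of aggregation can be expressed with agg(), which computes several aggregates in one pass; a sketch along the lines of the cell above, reusing the functions already imported:

# Multiple aggregates over the whole DataFrame in a single job
df.agg(sum('id'), mean('uniform'), max('normal')).show()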