In [1]:
# Spark imports
import pyspark.sql.functions as f
from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext
from pyspark.sql import Window
from pyspark.sql.types import *

conf = SparkConf() \
    .setAppName('pruebas') \
    .setMaster('yarn-client') \
    .set("spark.executor.memory", "4g") \
    .set("spark.executor.cores", "4") \
    .set("spark.executor.instances", "2") \
    .set("spark.driver.memory", "4g") \
    .set("spark.driver.cores", "2") \
    .set("spark.yarn.am.cores", "4") \
    .set("spark.yarn.am.memory", "8g") \
    .set("spark.logConf", "false") \
    .set("spark.kryoserializer.buffer.max", "2000m") \
    .set("spark.app.id", "myapp") \
    .set("spark.ui.port", "4235") \
    .set("spark.yarn.queue", "gbic") \
    .set("spark.driver.maxResultSize", "2048m")

# Stop any pre-existing SparkContext before creating a new one:
# if sc:
#     sc.stop()
sc = SparkContext(conf=conf)
sqlContext = HiveContext(sc)
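HiveContext is the Spark 1.x entry point; on Spark 2.x+ the same setup is usually expressed through SparkSession. A minimal sketch, assuming the same app name and resource settings carried over from the cell above:

# Sketch only: Spark 2.x+ equivalent of the SparkConf/HiveContext setup above.
from pyspark.sql import SparkSession

spark = (SparkSession.builder
         .appName("pruebas")
         .master("yarn")  # "yarn-client" became master "yarn" plus deploy mode "client" in 2.x
         .config("spark.executor.memory", "4g")
         .config("spark.executor.instances", "2")
         .enableHiveSupport()  # Hive support replaces HiveContext
         .getOrCreate())
sqlContext = spark  # the SparkSession exposes the same DataFrame methods used below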
In [7]:
from pyspark.sql.functions import rand, randn
# Create a DataFrame with a single LongType column named 'id' and 10 rows (0-9).
df = sqlContext.range(0, 10)
In [9]:
df
Out[9]:
DataFrame[id: bigint]
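The repr above only lists the column and its SQL type; printSchema (part of the DataFrame API) prints the full type tree, including nullability:

# Sketch: inspect the schema of the range DataFrame.
df.printSchema()
# root
#  |-- id: long (nullable = false)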
In [10]:
# Add two columns: 'uniform' ~ U(0, 1) via rand() and 'normal' ~ N(0, 1) via randn().
# Equivalent one-liner:
# df.select("id", rand(seed=10).alias("uniform"), randn(seed=27).alias("normal")).show()
df = df.withColumn('uniform', rand(seed=10))
df = df.withColumn('normal', randn(seed=27))
df.cache()  # avoid recomputing the random columns on every action below
df.show()
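A quick way to sanity-check these columns (not in the original cell) is df.describe, which is part of the DataFrame API; df.stat.corr gives the sample correlation between two numeric columns:

# Sketch: summary statistics and correlation for the generated columns.
df.describe('uniform', 'normal').show()
print(df.stat.corr('uniform', 'normal'))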
In [11]:
# Compute the total sum of a column.
# Note: importing min/max/sum from pyspark.sql.functions shadows the Python builtins.
from pyspark.sql.functions import mean, min, max, sum
fff = df.select([sum('id')])
fff.head()[0]
Out[11]:
45
In [12]:
fff.head()
Out[12]:
Row(sum(id)=45)
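The imports above also bring in mean, min, and max; a short sketch of applying them in one pass over the cached DataFrame:

# Sketch: several aggregates at once on the 'uniform' column.
df.select([mean('uniform'), min('uniform'), max('uniform')]).show()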