In [1]:
from pyspark import SparkContext
from pyspark.sql import SQLContext

# Spark 1.x-style entry points: a SparkContext plus its SQLContext wrapper
sc = SparkContext()
sqlContext = SQLContext(sc)
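On Spark 2.x and later the same setup is usually done through a single SparkSession; a minimal sketch (the app name "udf-demo" is an arbitrary choice here):

from pyspark.sql import SparkSession

# SparkSession bundles SparkContext and SQLContext into one entry point
spark = SparkSession.builder.appName("udf-demo").getOrCreate()
# spark.createDataFrame(...) then replaces sqlContext.createDataFrame(...)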
In [31]:
from pyspark.sql.types import StringType
from pyspark.sql.functions import udf
In [32]:
l = [('Alice', 16), ('john', 82), ('george', 13), ('mark', 24)]
df = sqlContext.createDataFrame(l, ['name', 'age'])
In [33]:
df.show()
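show() should print the four rows in insertion order (Spark right-aligns values):

+------+---+
|  name|age|
+------+---+
| Alice| 16|
|  john| 82|
|george| 13|
|  mark| 24|
+------+---+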
In [34]:
# Wrap a Python lambda as a UDF returning a string column
maturity_udf = udf(lambda age: "adult" if age >= 18 else "child", StringType())
In [35]:
df1 = df.withColumn("maturity", maturity_udf(df.age))
In [36]:
df1.printSchema()
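printSchema() should report the inferred types, with the new string column appended (Python ints are inferred as long):

root
 |-- name: string (nullable = true)
 |-- age: long (nullable = true)
 |-- maturity: string (nullable = true)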
In [37]:
df1.show()
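Each row now carries the derived label; the output should look like:

+------+---+--------+
|  name|age|maturity|
+------+---+--------+
| Alice| 16|   child|
|  john| 82|   adult|
|george| 13|   child|
|  mark| 24|   adult|
+------+---+--------+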
In [43]:
def condition(r):
    # Map an age to a maturity label; 18 and over counts as "adult",
    # consistent with the lambda UDF above
    if r < 1:
        return "infant"
    elif r < 18:
        return "child"
    else:
        return "adult"
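A quick sanity check of the boundaries in plain Python, before handing the function to Spark:

print(condition(0), condition(10), condition(18), condition(30))
# expected: infant child adult adult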
In [44]:
# Reuse condition() directly instead of wrapping it in a lambda
maturity_udf = udf(condition, StringType())
In [45]:
df2 = df.withColumn("maturity", maturity_udf(df.age))
In [46]:
df2.printSchema()
In [47]:
df2.show()
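df2.show() should match the df1 output above, since none of the sample ages fall below 1. For a simple mapping like this, Spark's built-in column expressions avoid Python UDF serialization overhead entirely; a sketch of the same column built with when/otherwise (df3 is just an illustrative name):

from pyspark.sql.functions import when

# Same maturity logic as condition(), expressed as native column operations
df3 = df.withColumn(
    "maturity",
    when(df.age < 1, "infant").when(df.age < 18, "child").otherwise("adult"))
df3.show()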