In [1]:
from pyspark import SparkContext
from pyspark.sql import SQLContext
sc = SparkContext()
sqlContext = SQLContext(sc)
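
Note: on Spark 2.x and later, SparkSession is the usual entry point instead of SQLContext; a minimal equivalent sketch (the app name "udf-demo" is an arbitrary choice, and getOrCreate() would reuse the context created above):

from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("udf-demo").getOrCreate()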

In [31]:
from pyspark.sql.types import StringType
from pyspark.sql.functions import udf

In [32]:
rows = [('Alice', 16), ('john', 82), ('george', 13), ('mark', 24)]
df = sqlContext.createDataFrame(rows, ['name', 'age'])

In [33]:
df.show()


+------+---+
|  name|age|
+------+---+
| Alice| 16|
|  john| 82|
|george| 13|
|  mark| 24|
+------+---+


In [34]:
maturity_udf = udf(lambda age: "adult" if age >=18 else "child", StringType())
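
If the age column can contain NULLs, the Python function receives None, and None >= 18 raises a TypeError on Python 3; a null-safe sketch (safe_maturity_udf is an illustrative name, and returning None keeps the result NULL):

safe_maturity_udf = udf(
    lambda age: None if age is None else ("adult" if age >= 18 else "child"),
    StringType())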

In [35]:
df1 = df.withColumn("maturity", maturity_udf(df.age))

In [36]:
df1.printSchema()


root
 |-- name: string (nullable = true)
 |-- age: long (nullable = true)
 |-- maturity: string (nullable = true)


In [37]:
df1.show()


+------+---+--------+
|  name|age|maturity|
+------+---+--------+
| Alice| 16|   child|
|  john| 82|   adult|
|george| 13|   child|
|  mark| 24|   adult|
+------+---+--------+
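
The same column can be derived without a Python UDF at all, using the built-in when/otherwise column expressions, which run inside the JVM and skip the Python serialization round-trip; a sketch (df1_builtin is an illustrative name):

from pyspark.sql.functions import when
df1_builtin = df.withColumn("maturity", when(df.age >= 18, "adult").otherwise("child"))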


In [43]:
def condition(r):
    # Map an age to a maturity bucket; the upper boundary
    # (18 and over is "adult") matches the first UDF above.
    if r < 1:
        return "infant"
    elif r < 18:
        return "child"
    else:
        return "adult"

In [44]:
#maturity_udf = udf(lambda age: "adult" if age >=18 else "child", StringType())
maturity_udf = udf(condition, StringType())
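
Keeping condition as a plain function also makes it easy to expose to SQL queries; a sketch using the SQLContext-era API (the table name "people" is illustrative):

sqlContext.registerFunction("maturity", condition, StringType())
df.registerTempTable("people")
sqlContext.sql("SELECT name, age, maturity(age) AS maturity FROM people").show()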

In [45]:
df2 = df.withColumn("maturity", maturity_udf(df.age))

In [46]:
df2.printSchema()


root
 |-- name: string (nullable = true)
 |-- age: long (nullable = true)
 |-- maturity: string (nullable = true)


In [47]:
df2.show()


+------+---+--------+
|  name|age|maturity|
+------+---+--------+
| Alice| 16|   child|
|  john| 82|   adult|
|george| 13|   child|
|  mark| 24|   adult|
+------+---+--------+
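
The three-branch rule can also be written with chained when expressions instead of a UDF; a sketch (df3 is an illustrative name):

from pyspark.sql.functions import when
df3 = df.withColumn(
    "maturity",
    when(df.age < 1, "infant").when(df.age < 18, "child").otherwise("adult"))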

