In [1]:
from pyspark import SparkContext
from pyspark.sql import SQLContext
sc = SparkContext()
sqlContext = SQLContext(sc)
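
Note: on Spark 2.x and later, SparkSession is the usual entry point instead of SQLContext; a minimal equivalent sketch (the app name "udf-demo" is an arbitrary choice, and getOrCreate() would reuse the context created above):

from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("udf-demo").getOrCreate()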

In [31]:
from pyspark.sql.types import StringType
from pyspark.sql.functions import udf

In [32]:
rows = [('Alice', 16), ('john', 82), ('george', 13), ('mark', 24)]
df = sqlContext.createDataFrame(rows, ['name', 'age'])

In [33]:
df.show()


+------+---+
|  name|age|
+------+---+
| Alice| 16|
|  john| 82|
|george| 13|
|  mark| 24|
+------+---+


In [34]:
maturity_udf = udf(lambda age: "adult" if age >=18 else "child", StringType())
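
If the age column can contain NULLs, the Python function receives None, and None >= 18 raises a TypeError on Python 3; a null-safe sketch (safe_maturity_udf is an illustrative name, and returning None keeps the result NULL):

safe_maturity_udf = udf(
    lambda age: None if age is None else ("adult" if age >= 18 else "child"),
    StringType())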

In [35]:
df1 = df.withColumn("maturity", maturity_udf(df.age))

In [36]:
df1.printSchema()


root
 |-- name: string (nullable = true)
 |-- age: long (nullable = true)
 |-- maturity: string (nullable = true)


In [37]:
df1.show()


+------+---+--------+
|  name|age|maturity|
+------+---+--------+
| Alice| 16|   child|
|  john| 82|   adult|
|george| 13|   child|
|  mark| 24|   adult|
+------+---+--------+
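
The same column can be derived without a Python UDF at all, using the built-in when/otherwise column expressions, which run inside the JVM and skip the Python serialization round-trip; a sketch (df1_builtin is an illustrative name):

from pyspark.sql.functions import when
df1_builtin = df.withColumn("maturity", when(df.age >= 18, "adult").otherwise("child"))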


In [43]:
def condition(r):
    # Map an age to a maturity bucket; the upper boundary
    # (18 and over is "adult") matches the first UDF above.
    if r < 1:
        return "infant"
    elif r < 18:
        return "child"
    else:
        return "adult"

In [44]:
#maturity_udf = udf(lambda age: "adult" if age >=18 else "child", StringType())
maturity_udf = udf(condition, StringType())
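
Keeping condition as a plain function also makes it easy to expose to SQL queries; a sketch using the SQLContext-era API (the table name "people" is illustrative):

sqlContext.registerFunction("maturity", condition, StringType())
df.registerTempTable("people")
sqlContext.sql("SELECT name, age, maturity(age) AS maturity FROM people").show()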

In [45]:
df2 = df.withColumn("maturity", maturity_udf(df.age))

In [46]:
df2.printSchema()


root
 |-- name: string (nullable = true)
 |-- age: long (nullable = true)
 |-- maturity: string (nullable = true)


In [47]:
df2.show()


+------+---+--------+
|  name|age|maturity|
+------+---+--------+
| Alice| 16|   child|
|  john| 82|   adult|
|george| 13|   child|
|  mark| 24|   adult|
+------+---+--------+
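
The three-branch rule can also be written with chained when expressions instead of a UDF; a sketch (df3 is an illustrative name):

from pyspark.sql.functions import when
df3 = df.withColumn(
    "maturity",
    when(df.age < 1, "infant").when(df.age < 18, "child").otherwise("adult"))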

