In [1]:
# create entry points to Spark; stop any running context first
try:
    sc.stop()
except Exception:
    pass

from pyspark import SparkContext
from pyspark.sql import SparkSession

sc = SparkContext()
spark = SparkSession(sparkContext=sc)
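On Spark 2.x and later, the same two entry points are more commonly obtained through the builder API, which reuses an already-running session instead of failing:
# equivalent setup via the builder; getOrCreate reuses an active session
spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext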
Sometimes we need to merge multiple columns of a DataFrame into one column, or split a single column into multiple columns. We can achieve both by converting the DataFrame to an RDD, applying a map function that rearranges the elements of each Row, and then converting the RDD back to a DataFrame.
In [2]:
mtcars = spark.read.csv(path='../../data/mtcars.csv',
                        sep=',',
                        encoding='UTF-8',
                        comment=None,
                        header=True,
                        inferSchema=True)
In [3]:
mtcars.show(n=5)
In [4]:
# rename the unnamed first column, which holds the car model names
colnames = mtcars.columns
colnames[0] = 'model'
mtcars = mtcars.rdd.toDF(colnames)
mtcars.show(5)
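As an aside, the round trip through the RDD is not strictly required for a rename; a minimal sketch of the same fix using the standard DataFrame methods:
# equivalent renames that stay in the DataFrame API
mtcars = mtcars.withColumnRenamed(mtcars.columns[0], 'model')
# or, renaming all columns at once:
# mtcars = mtcars.toDF(*colnames)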
In [5]:
from pyspark.sql import Row
# merge all columns after the first into a single 'values' field
mtcars_rdd = mtcars.rdd.map(lambda x: Row(model=x[0], values=x[1:]))
mtcars_rdd.take(5)
Each element of the resulting RDD is a Row whose model field holds the car name and whose values field holds the remaining columns as a tuple.
Then we create a new DataFrame from the obtained RDD.
In [6]:
mtcars_df = spark.createDataFrame(mtcars_rdd)
mtcars_df.show(5, truncate=False)
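Spark typically infers a Python tuple as a struct column with auto-generated field names; an optional schema check shows how the merged values column was typed:
# inspect how Spark typed the merged 'values' column
mtcars_df.printSchema()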
Conversely, we can split the values column back into two new columns.
In [7]:
# split 'values': the first five elements go to x1, the rest to x2
mtcars_rdd_2 = mtcars_df.rdd.map(lambda x: Row(model=x[0], x1=x[1][:5], x2=x[1][5:]))
# convert RDD back to DataFrame
mtcars_df_2 = spark.createDataFrame(mtcars_rdd_2)
mtcars_df_2.show(5, truncate=False)
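For comparison, the same merge and split can also be expressed without leaving the DataFrame API. The sketch below assumes Spark 2.4+ (for slice) and mirrors the split point 5 used above; value_cols is a name introduced here for illustration:
from pyspark.sql import functions as F

# merge every column except 'model' into a single array column
value_cols = [c for c in mtcars.columns if c != 'model']
merged = mtcars.select('model', F.array(*value_cols).alias('values'))

# split the array back into two columns; F.slice uses 1-based indexing
split_df = merged.select(
    'model',
    F.slice('values', 1, 5).alias('x1'),
    F.slice('values', 6, len(value_cols) - 5).alias('x2'))
split_df.show(5, truncate=False)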