In [1]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
In [2]:
sc = SparkContext(conf=SparkConf())
spark = SparkSession(sparkContext=sc)
In [3]:
mtcars = spark.read.csv('../../data/mtcars.csv', header=True, inferSchema=True)
In [4]:
mtcars.show(3)
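Since `inferSchema=True` was passed, it can help to confirm the column types Spark inferred. The cell below is a small check (not part of the original run) using the standard `printSchema` call.
In [ ]:
# inspect the schema that inferSchema=True produced for the CSV columns
mtcars.printSchema()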
In [5]:
# the first CSV column has no header, so Spark reads it as '_c0'; give it a meaningful name
mtcars = mtcars.withColumnRenamed('_c0', 'row_names')
mtcars.show(3)
In [6]:
new_col_names = ['x_' + x for x in mtcars.columns]
new_col_names
Out[6]:
In [7]:
mtcars = mtcars.rdd.toDF(new_col_names)
mtcars.show(3)
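The rename above converts the DataFrame to an RDD and back. As a sketch of an equivalent, more direct route, `DataFrame.toDF` accepts the new column names positionally (the `mtcars_renamed` name below is only illustrative):
In [ ]:
# same rename without the round trip through the RDD
mtcars_renamed = mtcars.toDF(*new_col_names)
mtcars_renamed.show(3)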
In [8]:
twitter = sc.textFile('../../data/twitter.txt')
twitter.take(5)
Out[8]:
In [9]:
from pyspark.sql import DataFrameWriter
Before we write the data to a file, we need to coalesce it into a single partition. Otherwise, Spark writes one output file per partition, producing multiple files.
In [10]:
mtcars = mtcars.coalesce(numPartitions=1)
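As a quick sanity check (not part of the original run), the partition count can be read off the DataFrame's underlying RDD:
In [ ]:
# should report 1 after the coalesce above
mtcars.rdd.getNumPartitions()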
In [12]:
mtcars.write.csv('data/saved-mtcars', header=True)
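Note that Spark writes `data/saved-mtcars` as a directory of part files, not as a single CSV file. Below is a sketch of reading it back to verify the round trip (`saved_mtcars` is just an illustrative name). If the cell above is re-run, passing `mode='overwrite'` to `write.csv` avoids the error about the output directory already existing.
In [ ]:
# load the saved directory back to confirm the write
saved_mtcars = spark.read.csv('data/saved-mtcars', header=True, inferSchema=True)
saved_mtcars.show(3)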
In [13]:
twitter = twitter.coalesce(numPartitions=1)
In [14]:
twitter.saveAsTextFile('data/saved-twitter')
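The same applies to the RDD: `saveAsTextFile` also writes a directory of part files, which `sc.textFile` can read back (the `saved_twitter` name below is only illustrative):
In [ ]:
# read the saved text back and look at the first few lines
saved_twitter = sc.textFile('data/saved-twitter')
saved_twitter.take(5)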