In [8]:
val textFile = spark.read.textFile("/home/jovyan/work/data/imdb_master.csv")
textFile.count()
textFile.printSchema()
textFile.show()
textFile.first() // first line of the file
val linesWithSpark = textFile.filter(line => line.contains("review"))
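// Quick sanity check (a minimal sketch): materialize the filtered Dataset and count the matches.
linesWithSpark.count()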
Out[8]:
In [6]:
%AddJar file:///home/jovyan/work/data/postgresql-42.2.2.jre7.jar
%AddJar file:///home/jovyan/work/data/greenplum-spark_2.11-1.4.0-alpha-21-g3d01ce6-SNAPSHOT.jar
In [7]:
Class.forName("org.postgresql.Driver") // verify the PostgreSQL JDBC driver is on the classpath
Class.forName("io.pivotal.greenplum.spark.GreenplumRelationProvider") // verify the Greenplum-Spark connector is loadable
Out[7]:
In [34]:
// https://dzone.com/articles/parsing-and-querying-csv-apache-spark
val sqlContext = spark.sqlContext // reuse the session's SQLContext; new SQLContext(sc) is deprecated in Spark 2.x
// val df = sqlContext.read.format("csv").load("/home/jovyan/work/data/imdb_master.csv")
// df.printSchema()
val imdb_master = sqlContext.read.format("csv")
.option("header", "true")
.option("inferSchema", "true")
.load("/home/jovyan/work/data/imdb_master.csv")
imdb_master.printSchema()
val selectedType = imdb_master.select("type")
selectedType.write.mode("append").format("csv").option("header", "true").save("/home/jovyan/work/data/type.csv") // without format("csv"), save() would default to Parquet
selectedType.show()
imdb_master.createOrReplaceTempView("my_table") // register the DataFrame as a temporary view for SQL (registerTempTable is deprecated)
val usingSQL = sqlContext.sql("select * from my_table")
usingSQL.show() // display the CSV contents queried through the temp view
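// A hedged follow-up example: aggregate over the "type" column selected above
// (assumes "type" holds the train/test split, as in the Kaggle imdb_master.csv).
sqlContext.sql("select `type`, count(*) as cnt from my_table group by `type`").show()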
Out[34]:
In [16]:
val dataFrame = spark.read.format("io.pivotal.greenplum.spark.GreenplumRelationProvider")
.option("dbtable", "usertable")
.option("url", "jdbc:postgresql://gpdbsne/basic_db")
.option("user", "gpadmin")
.option("password", "pivotal")
.option("driver", "org.postgresql.Driver")
.option("partitionColumn", "id")
.load()
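// Minimal follow-up sketch, assuming the Greenplum connection above succeeds:
dataFrame.printSchema() // verify the columns pulled from "usertable"
dataFrame.count()       // force a read to confirm the partitioned load works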
Out[16]: