In [ ]:
// Load the raw IMDB dataset as a Dataset[String], one element per line.
val textFile = spark.read.textFile("/home/jovyan/work/data/imdb_master.csv")

// Quick sanity checks on the loaded data.
textFile.count()

textFile.printSchema()

textFile.show()

textFile.first()


// Keep only the lines that mention "review".
val linesWithSpark = textFile.filter(_.contains("review"))

In [ ]:
%AddJar file:///home/jovyan/work/data/postgresql-42.2.2.jre7.jar
%AddJar file:///home/jovyan/work/data/greenplum-spark_2.11-1.5.0.jar

In [ ]:
// Force-load the JDBC driver and the Greenplum relation provider so they
// self-register before any connection is attempted.
Seq(
  "org.postgresql.Driver",
  "io.pivotal.greenplum.spark.GreenplumRelationProvider"
).foreach(name => Class.forName(name))

In [ ]:
// https://dzone.com/articles/parsing-and-querying-csv-apache-spark

import org.apache.spark.sql.SQLContext


// In Spark 2.x, `new SQLContext(sc)` is deprecated; the SparkSession (`spark`)
// is the entry point. Keep the `sqlContext` val for compatibility with the
// rest of the notebook — `spark.sqlContext` is the session's own context.
val sqlContext = spark.sqlContext

// val df = sqlContext.read.format("csv").load("/home/jovyan/work/data/imdb_master.csv")
// df.printSchema()


// Parse the CSV using its header row and let Spark infer column types.
val imdb_master = sqlContext.read.format("csv")
  .option("header", "true")
  .option("inferSchema", "true")
  .load("/home/jovyan/work/data/imdb_master.csv")
imdb_master.printSchema()


// Project the "type" column and append it to disk.
// FIX: the original called .save() with no format, which writes Parquet
// (Spark's default) into a ".csv"-named directory; request CSV explicitly.
val selectedType = imdb_master.select("type")
selectedType.write.mode("Append").format("csv").save("/home/jovyan/work/data/type.csv")
selectedType.show()


// Register the DataFrame as a temporary view so it can be queried with SQL.
// FIX: registerTempTable is deprecated since Spark 2.0; createOrReplaceTempView
// is the replacement with the same behaviour here. (Both return Unit, so
// `tempTable` stays a Unit-valued val for backward compatibility.)
val tempTable = imdb_master.createOrReplaceTempView("my_table")

// Show all of the CSV's rows via a SQL query against the temp view.
val usingSQL = sqlContext
  .sql("select * from my_table")
usingSQL.show()

In [ ]:
// Read the "usertable" table from Greenplum into a DataFrame via the Pivotal
// Greenplum-Spark connector, partitioning reads on the "id" column.
// NOTE(review): credentials are hard-coded in the notebook — consider moving
// them to environment variables or a config file before sharing.
val dataFrame = spark.read
  .format("io.pivotal.greenplum.spark.GreenplumRelationProvider")
  .options(Map(
    "url"             -> "jdbc:postgresql://gpdbsne/basic_db",
    "dbtable"         -> "usertable",
    "user"            -> "gpadmin",
    "password"        -> "pivotal",
    "driver"          -> "org.postgresql.Driver",
    "partitionColumn" -> "id"
  ))
  .load()

In [ ]: