notebook.community

Edit and run



In [8]:

    
val textFile = spark.read.textFile("/home/jovyan/work/data/imdb_master.csv")

textFile.count() 

textFile.printSchema()

textFile.show()

textFile.first() // F


val linesWithSpark = textFile.filter(line => line.contains("review"))









    



root
 |-- value: string (nullable = true)

+--------------------+
|               value|
+--------------------+
|,type,review,labe...|
|0,test,"Once agai...|
|1,test,"This is a...|
|2,test,"First of ...|
|3,test,"Not even ...|
|4,test,"Brass pic...|
|5,test,"A funny t...|
|6,test,"This Germ...|
|7,test,"Being a l...|
|8,test,"""Tokyo E...|
|9,test,"Wealthy h...|
|10,test,"Cage pla...|
|11,test,"First of...|
|12,test,"So tell ...|
|13,test,"A big di...|
|14,test,"This fil...|
|15,test,"Here's a...|
|16,test,"At the b...|
|17,test,"Earth ha...|
|18,test,"Many peo...|
+--------------------+
only showing top 20 rows







    





textFile = [value: string]
linesWithSpark = [value: string]







    Out[8]:





[value: string]



In [4]:









    Out[4]:





Name: java.lang.ClassNotFoundException
Message: io.pivotal.greenplum.spark.GreenplumRelationProvider
StackTrace:   at scala.reflect.internal.util.AbstractFileClassLoader.findClass(AbstractFileClassLoader.scala:62)
  at java.lang.ClassLoader.loadClass(ClassLoader.java:424)
  at java.lang.ClassLoader.loadClass(ClassLoader.java:357)
  at java.lang.Class.forName0(Native Method)
  at java.lang.Class.forName(Class.java:264)



In [6]:

    
%AddJar file:///home/jovyan/work/data/postgresql-42.2.2.jre7.jar
%AddJar file:///home/jovyan/work/data/greenplum-spark_2.11-1.4.0-alpha-21-g3d01ce6-SNAPSHOT.jar









    



Using cached version of postgresql-42.2.2.jre7.jar






    





lastException: Throwable = null







    



Starting download from file:///home/jovyan/work/data/greenplum-spark_2.11-1.4.0-alpha-21-g3d01ce6-SNAPSHOT.jar
Finished download of greenplum-spark_2.11-1.4.0-alpha-21-g3d01ce6-SNAPSHOT.jar



In [7]:

    
Class.forName("org.postgresql.Driver")

Class.forName("io.pivotal.greenplum.spark.GreenplumRelationProvider")









    Out[7]:





class io.pivotal.greenplum.spark.GreenplumRelationProvider



In [34]:

    
// https://dzone.com/articles/parsing-and-querying-csv-apache-spark

import org.apache.spark.sql.SQLContext


val sqlContext = new SQLContext(sc)

// val df = sqlContext.read.format("csv").load("/home/jovyan/work/data/imdb_master.csv")
// df.printSchema()


 val imdb_master = sqlContext.read.format("csv")
   .option("header", "true")
   .option("inferSchema", "true")
   .load("/home/jovyan/work/data/imdb_master.csv")
imdb_master.printSchema()


val selectedType = imdb_master.select("type")
  selectedType.write.mode("Append").save("/home/jovyan/work/data/type.csv")
selectedType.show()



 val tempTable = imdb_master.registerTempTable("my_table")
  //makes a temporary table
  val usingSQL = sqlContext
    .sql("select * from my_table") 
//show all the csv file's data in temp table
  usingSQL.show()









    



root
 |-- _c0: integer (nullable = true)
 |-- type: string (nullable = true)
 |-- review: string (nullable = true)
 |-- label: string (nullable = true)
 |-- file: string (nullable = true)

+----+
|type|
+----+
|test|
|test|
|test|
|test|
|test|
|test|
|test|
|test|
|test|
|test|
|test|
|test|
|test|
|test|
|test|
|test|
|test|
|test|
|test|
|test|
+----+
only showing top 20 rows

+---+----+--------------------+--------------------+--------------------+
|_c0|type|              review|               label|                file|
+---+----+--------------------+--------------------+--------------------+
|  0|test|Once again Mr. Co...|                 neg|             0_2.txt|
|  1|test|This is an exampl...|                 neg|         10000_4.txt|
|  2|test|First of all I ha...|                 neg|         10001_1.txt|
|  3|test|Not even the Beat...|                 neg|         10002_3.txt|
|  4|test|Brass pictures (m...|                 neg|         10003_3.txt|
|  5|test|"A funny thing ha...| the hero is a de...|           excuse me|
|  6|test|This German horro...|                 neg|         10005_2.txt|
|  7|test|"Being a long-tim...| plots and twists...|         forcedly so|
|  8|test|"""Tokyo Eyes"" t...|                talk| and more talk. Y...|
|  9|test|Wealthy horse ran...|                 neg|         10008_4.txt|
| 10|test|"Cage plays a dru...|                 neg|         10009_3.txt|
| 11|test|First of all, I w...|                 neg|          1000_3.txt|
| 12|test|So tell me - what...|                 neg|         10010_2.txt|
| 13|test|A big disappointm...|                 neg|         10011_1.txt|
| 14|test|This film is abso...|                 neg|         10012_1.txt|
| 15|test|Here's a decidedl...|                 neg|         10013_4.txt|
| 16|test|At the bottom end...|                 neg|         10014_2.txt|
| 17|test|Earth has been de...|                 neg|         10015_4.txt|
| 18|test|Many people are s...|                 neg|         10016_3.txt|
| 19|test|"New York family ...| as Niven's daugh...|                 neg|
+---+----+--------------------+--------------------+--------------------+
only showing top 20 rows







    





sqlContext = org.apache.spark.sql.SQLContext@1baf1424
imdb_master = [_c0: int, type: string ... 3 more fields]
selectedType = [type: string]
usingSQL = [_c0: int, type: string ... 3 more fields]







    





warning: there were two deprecation warnings; re-run with -deprecation for details
tempTable: Unit = ()







    Out[34]:





[_c0: int, type: string ... 3 more fields]



In [16]:

    
val dataFrame = spark.read.format("io.pivotal.greenplum.spark.GreenplumRelationProvider")
.option("dbtable", "usertable")
.option("url", "jdbc:postgresql://gpdbsne/basic_db")
.option("user", "gpadmin")
.option("password", "pivotal")
.option("driver", "org.postgresql.Driver")
.option("partitionColumn", "id")
.load()

//.option("partitionColumn", "id")









    





lastException = null







    Out[16]:





Name: java.lang.NoSuchMethodError
Message: org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$.getSchema(Ljava/sql/ResultSet;Lorg/apache/spark/sql/jdbc/JdbcDialect;)Lorg/apache/spark/sql/types/StructType;
StackTrace:   at io.pivotal.greenplum.spark.GreenplumRelationProvider.createRelation(GreenplumRelationProvider.scala:29)
  at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:340)
  at org.apache.spark.sql.DataFrameReader.loadV1Source(DataFrameReader.scala:239)
  at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:227)
  at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:164)



In [ ]: