In [ ]:
from pyspark import SparkContext
from py4j.java_gateway import java_import
# Private PySpark helper that converts a Python RDD into a JavaRDD
from pyspark.mllib.common import _to_java_object_rdd
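
The java_import call in the next cell resolves only if the Vitk JAR is on the JVM classpath. A minimal sketch of one way to arrange that when creating the context (the JAR name and path are assumptions; adjust them to your build, or pass --jars to pyspark/spark-submit instead):

In [ ]:
from pyspark import SparkConf, SparkContext

# Hypothetical JAR location: point this at your actual Vitk build artifact
conf = SparkConf().setAppName('vnTokenizer').set('spark.jars', '/path/to/vn.vitk-3.0.jar')
sc = SparkContext.getOrCreate(conf=conf)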

In [ ]:
# Import the vnTokenizer class from the Vitk package on the JVM side
java_import(sc._gateway.jvm, "vn.vitk.tok.Tokenizer")
Tokenizer = sc._jvm.vn.vitk.tok.Tokenizer
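
A quick sanity check, relying on py4j's usual resolution behavior: the handle prints as a JavaClass when the class was found on the classpath, and as a JavaPackage when it was not.

In [ ]:
# JavaClass here means the Tokenizer class resolved on the JVM side
print(Tokenizer)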

In [ ]:
# Load the data set as an RDD of text lines
data = sc.textFile('./data/data_test.txt')
# Convert the Python RDD to a JavaRDD so it can be passed to the Java tokenizer
data_rdd_java = _to_java_object_rdd(data)
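
Before handing the data to the JVM, it is worth previewing a few input lines on the Python side; a small sketch:

In [ ]:
# Preview the first few raw lines before tokenization
for line in data.take(3):
    print(line)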

In [ ]:
# Tokenize the data set using vnTokenizer's lexicon and regexp resources
dataFolder = '/export/dat/tok'
tokenizer = Tokenizer(sc._jsc, dataFolder + "/lexicon.xml", dataFolder + "/regexp.txt")
result = tokenizer.tokenize(data_rdd_java)
# Note: saveAsTextFile fails if the output directory already exists
result.saveAsTextFile('./output/tokenize')
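
Since result is a JavaRDD living on the JVM side, the simplest way to inspect it from Python is to read the saved text output back in; a minimal sketch using the output path above:

In [ ]:
# Read the tokenized output back into a Python RDD and preview it
tokenized = sc.textFile('./output/tokenize')
for line in tokenized.take(5):
    print(line)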