In [16]:
# Tiny labelled corpus: (tweet text, author id, binary class label).
twitter = spark.createDataFrame(
    [
        ('Wenqiang is a spark expert', 'Wenqiang', 1.0),
        ('Ming is learning spark', 'Ming', 0.0),
    ],
    schema=['text', 'id', 'label'],
)
In [17]:
# Display the raw input frame (text / id / label columns).
twitter.show()
In [18]:
from pyspark.ml.feature import Tokenizer
In [35]:
# Split each tweet's 'text' into a list of word tokens in a new 'tokens' column.
# Construct-and-transform in one chained expression; the Tokenizer instance is
# not needed again, so no intermediate variable is kept.
twitter_tokens = Tokenizer(inputCol='text', outputCol='tokens').transform(twitter)
twitter_tokens.show()
In [36]:
from pyspark.ml.feature import HashingTF

# Map token lists to fixed-size (2**4 = 16 dimensional) term-frequency vectors
# via the hashing trick; written as a single chained call since the HashingTF
# instance is not reused.
hashingTF_twitter = (
    HashingTF(numFeatures=2 ** 4, inputCol='tokens', outputCol='features')
    .transform(twitter_tokens)
)
In [37]:
# truncate=False so the full sparse feature vectors are printed, not clipped.
hashingTF_twitter.show(truncate=False)
In [38]:
from pyspark.ml.feature import CountVectorizer

# Learn a vocabulary (capped at 2**4 = 16 terms) from the token column, then
# encode each document as term-count vectors in a 'features' column.
# fit() is chained onto the estimator; the fitted model keeps its own name
# since it holds the learned vocabulary.
countVectorizer_mod = CountVectorizer(
    vocabSize=2 ** 4, inputCol='tokens', outputCol='features'
).fit(twitter_tokens)
countVectorizer_twitter = countVectorizer_mod.transform(twitter_tokens)
In [39]:
# truncate=False so the full count vectors are visible alongside the tokens.
countVectorizer_twitter.show(truncate=False)
In [ ]: