HashingTF and CountVectorizer

Both transformers turn token lists into term-frequency vectors: HashingTF hashes each token into one of a fixed number of buckets, while CountVectorizer learns an explicit vocabulary from the data and counts occurrences against it.

Load data


In [16]:
twitter = spark.createDataFrame([
                                ('Wenqiang is a spark expert', 'Wenqiang', 1.0),
                                ('Ming is learning spark', 'Ming', 0.0)],
                                ['text', 'id', 'label']
                               )

In [17]:
twitter.show()


+--------------------+--------+-----+
|                text|      id|label|
+--------------------+--------+-----+
|Wenqiang is a spa...|Wenqiang|  1.0|
|Ming is learning ...|    Ming|  0.0|
+--------------------+--------+-----+
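
A quick sanity check on the column types (printSchema is a standard DataFrame method):


In [ ]:
twitter.printSchema()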

Tokenization


In [18]:
from pyspark.ml.feature import Tokenizer

In [35]:
# Tokenizer lowercases the text and splits it on whitespace.
tokenizer_mod = Tokenizer(inputCol='text', outputCol='tokens')
twitter_tokens = tokenizer_mod.transform(twitter)
twitter_tokens.show()


+--------------------+--------+-----+--------------------+
|                text|      id|label|              tokens|
+--------------------+--------+-----+--------------------+
|Wenqiang is a spa...|Wenqiang|  1.0|[wenqiang, is, a,...|
|Ming is learning ...|    Ming|  0.0|[ming, is, learni...|
+--------------------+--------+-----+--------------------+
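
show() truncates the arrays above; selecting just the tokens column without truncation shows the full, lowercased token lists:


In [ ]:
twitter_tokens.select('tokens').show(truncate=False)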

HashingTF


In [36]:
from pyspark.ml.feature import HashingTF

# numFeatures sets the number of hash buckets (here 2^4 = 16).
hashingTF_mod = HashingTF(numFeatures=pow(2, 4), inputCol='tokens', outputCol='features')
hashingTF_twitter = hashingTF_mod.transform(twitter_tokens)

In [37]:
hashingTF_twitter.show(truncate=False)


+--------------------------+--------+-----+--------------------------------+---------------------------------+
|text                      |id      |label|tokens                          |features                         |
+--------------------------+--------+-----+--------------------------------+---------------------------------+
|Wenqiang is a spark expert|Wenqiang|1.0  |[wenqiang, is, a, spark, expert]|(16,[1,2,9,13],[2.0,1.0,1.0,1.0])|
|Ming is learning spark    |Ming    |0.0  |[ming, is, learning, spark]     |(16,[0,1,14],[1.0,2.0,1.0])      |
+--------------------------+--------+-----+--------------------------------+---------------------------------+
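
With only 16 buckets, collisions are expected. Each row above has one fewer nonzero index than it has distinct tokens, and both rows carry the value 2.0 at index 1, so two different tokens (most plausibly 'is' and 'spark', the two tokens the rows share) are landing in the same bucket. A minimal way to inspect the mapping, assuming Spark 3.0+ where HashingTF.indexOf is available:


In [ ]:
# indexOf(term) returns the hash bucket a term maps to (Spark >= 3.0).
for term in ['wenqiang', 'is', 'a', 'spark', 'expert', 'ming', 'learning']:
    print(term, '->', hashingTF_mod.indexOf(term))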

CountVectorizer


In [38]:
from pyspark.ml.feature import CountVectorizer

# vocabSize caps the vocabulary at 2^4 = 16 terms. Unlike HashingTF,
# CountVectorizer is an estimator, so it must be fit before transforming.
count_vectorizer = CountVectorizer(vocabSize=pow(2, 4), inputCol='tokens', outputCol='features')
countVectorizer_mod = count_vectorizer.fit(twitter_tokens)
countVectorizer_twitter = countVectorizer_mod.transform(twitter_tokens)

In [39]:
countVectorizer_twitter.show(truncate=False)


+--------------------------+--------+-----+--------------------------------+-------------------------------------+
|text                      |id      |label|tokens                          |features                             |
+--------------------------+--------+-----+--------------------------------+-------------------------------------+
|Wenqiang is a spark expert|Wenqiang|1.0  |[wenqiang, is, a, spark, expert]|(7,[0,1,2,3,5],[1.0,1.0,1.0,1.0,1.0])|
|Ming is learning spark    |Ming    |0.0  |[ming, is, learning, spark]     |(7,[0,1,4,6],[1.0,1.0,1.0,1.0])      |
+--------------------------+--------+-----+--------------------------------+-------------------------------------+
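
The vectors are length 7 because the corpus contains 7 distinct tokens, all under the vocabSize cap of 16. The fitted model exposes the learned term-to-index mapping through its vocabulary attribute; terms are ordered by corpus frequency, which is why 'is' and 'spark' (the tokens appearing in both rows) occupy indices 0 and 1:


In [ ]:
# vocabulary[i] is the term counted at index i of the feature vector.
countVectorizer_mod.vocabulary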

