Term frequency-inverse document frequency
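Spark ML's IDF uses a smoothed inverse document frequency,

    idf(t, D) = log((|D| + 1) / (df(t, D) + 1)),

where |D| is the number of documents in the corpus and df(t, D) is the number of documents containing term t; the final feature value is tf(t, d) · idf(t, D). The cells below tokenize three sentences, hash the tokens into a small term-frequency vector, and rescale it by IDF.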


In [1]:
from pyspark.ml.feature import HashingTF, IDF, Tokenizer

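The cells below assume a live `spark` session, which pyspark shells and most Spark notebooks provide automatically; otherwise a minimal session can be created first (the app name is an arbitrary placeholder):

from pyspark.sql import SparkSession

# Create or reuse a local SparkSession
spark = SparkSession.builder.appName("tf-idf-example").getOrCreate()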
In [13]:
# Toy corpus: a numeric label and one sentence per row
sentenceData = spark.createDataFrame([
    (0.0, "Hi I heard about Spark"),
    (1.0, "I wish Java could use case classes"),
    (2.0, "Logistic regression models are neat")
], ["label", "sentence"])

In [14]:
# Split each sentence into lowercase, whitespace-delimited tokens
tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
wordsData = tokenizer.transform(sentenceData)

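As a quick sanity check (not part of the pipeline), the tokenized column can be inspected:

wordsData.select("sentence", "words").show(truncate=False)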
In [15]:
# Hash each token into one of numFeatures buckets and count occurrences per document
# (20 buckets keeps the output readable but makes hash collisions likely)
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
featurizedData = hashingTF.transform(wordsData)
# Alternatively, CountVectorizer can be used to get term frequency vectors (sketch below)

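As the comment above notes, CountVectorizer is an alternative to HashingTF that learns an explicit vocabulary instead of hashing terms into buckets. A minimal sketch, reusing wordsData from above (variable names here are placeholders):

from pyspark.ml.feature import CountVectorizer

# Learn a vocabulary from the corpus, so feature indices map back to real words
cv = CountVectorizer(inputCol="words", outputCol="rawFeatures", vocabSize=20)
cvModel = cv.fit(wordsData)
countFeaturizedData = cvModel.transform(wordsData)

Unlike HashingTF, this needs an extra pass over the data to fit the vocabulary, but cvModel.vocabulary lets you recover which word each feature index represents.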
In [16]:
# fit() computes a per-term IDF weight over the whole corpus;
# transform() then rescales each raw TF vector by those weights
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)

In [19]:
rescaledData.select("label", "features").show(truncate=False)


+-----+----------------------------------------------------------------------------------------------------------------------+
|label|features                                                                                                              |
+-----+----------------------------------------------------------------------------------------------------------------------+
|0.0  |(20,[0,5,9,17],[0.6931471805599453,0.6931471805599453,0.28768207245178085,1.3862943611198906])                        |
|1.0  |(20,[2,7,9,13,15],[0.6931471805599453,0.6931471805599453,0.8630462173553426,0.28768207245178085,0.28768207245178085]) |
|2.0  |(20,[4,6,13,15,18],[0.6931471805599453,0.6931471805599453,0.28768207245178085,0.28768207245178085,0.6931471805599453])|
+-----+----------------------------------------------------------------------------------------------------------------------+
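These values are consistent with the smoothed IDF formula above: with |D| = 3, a hashed term appearing in one document gets idf = log(4/2) ≈ 0.6931, and one appearing in two documents gets log(4/3) ≈ 0.2877; the 1.3863 entry is a term frequency of 2 times 0.6931. Each row is printed as a sparse vector: (size, [indices], [values]).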

