notebook.community

Edit and run



In [1]:

    
from pyspark.ml.fpm import FPGrowth
from pyspark.sql import SparkSession



In [2]:

    
# Build (or reuse, via getOrCreate) a Spark session with master set to "local".
spark = (
    SparkSession.builder
    .master("local")
    .appName("Association Rules FP-Growth")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

# Toy input: three rows, each pairing an id with a list of item ids.
df = spark.createDataFrame(
    data=[
        (0, [1, 2, 5]),
        (1, [1, 2, 3, 5]),
        (2, [1, 2]),
    ],
    schema=["id", "items"],
)

# Fit FP-Growth on the "items" column with the support / confidence
# thresholds used throughout this notebook.
fpGrowth = FPGrowth(minSupport=0.5, minConfidence=0.6, itemsCol="items")
model = fpGrowth.fit(df)

# Print the frequent itemsets found by the fitted model.
model.freqItemsets.show()

# Keep a handle on the generated association rules; nothing is displayed
# here -- the rules are only assigned for use by later cells.
df_rules = model.associationRules









    



+---------+----+
|    items|freq|
+---------+----+
|      [5]|   2|
|   [5, 2]|   2|
|[5, 2, 1]|   2|
|   [5, 1]|   2|
|      [2]|   3|
|   [2, 1]|   3|
|      [1]|   3|
+---------+----+



In [4]:

    
from pyspark.sql.functions import array, lit

# Show only the rules whose consequent is exactly the single-item array [1].
# The original used the Python 2-only long literal `1L`, which is a
# SyntaxError on Python 3. Building the literal from a plain int and casting
# it to "long" runs on both Python 2 and 3 and keeps the literal a
# long-typed column, as `lit(1L)` was under Python 2.
df_rules.where(df_rules.consequent == array(lit(1).cast("long"))).show()









    



+----------+----------+----------+
|antecedent|consequent|confidence|
+----------+----------+----------+
|    [5, 2]|       [1]|       1.0|
|       [5]|       [1]|       1.0|
|       [2]|       [1]|       1.0|
+----------+----------+----------+