In [1]:
# create entry points to spark
try:
    sc.stop()
except:
    pass

from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

sc = SparkContext()
spark = SparkSession(sparkContext=sc)
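The same entry points can also be obtained through the builder API, which creates a new session or reuses an existing one (a minimal sketch, equivalent for the purposes of this tutorial):

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()  # create or reuse a session
sc = spark.sparkContext                     # the underlying SparkContext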
There are two functions we can use to split a continuous variable into categories:

- pyspark.ml.feature.Binarizer: splits a column of continuous features into two values (0.0/1.0) given a threshold.
- pyspark.ml.feature.Bucketizer: splits a column of continuous features into categories given several split points.
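A value's bucket index is determined by which interval contains it: each bucket is closed on the left and open on the right, except the last bucket, which also includes its upper bound. A quick sketch with a toy column (the name v and the sample values are illustrative only):

from pyspark.ml.feature import Bucketizer

toy = spark.createDataFrame([(0.0,), (2.5,), (9.9,), (10.0,)], ['v'])
Bucketizer(splits=[0, 2.5, 5, 7.5, 10], inputCol='v', outputCol='bucket').transform(toy).show()
# 0.0 -> bucket 0, 2.5 -> bucket 1, 9.9 and 10.0 -> bucket 3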
In [2]:
import numpy as np
import pandas as pd

# generate a small pandas DataFrame with reproducible random data
np.random.seed(seed=1234)
pdf = pd.DataFrame({
    'x1': np.random.randn(10),     # standard normal values (can be negative)
    'x2': np.random.rand(10) * 10  # uniform values in [0, 10)
})
np.random.seed(seed=None)

# convert it to a Spark DataFrame
df = spark.createDataFrame(pdf)
df.show()
In [3]:
from pyspark.ml.feature import Binarizer, Bucketizer
# binarize x1: values greater than the threshold (0) become 1.0, the rest 0.0
binarizer = Binarizer(threshold=0.0, inputCol='x1', outputCol='x1_new')
# 5 split points define 4 buckets: [0, 2.5), [2.5, 5), [5, 7.5), [7.5, 10]
bucketizer = Bucketizer(splits=[0, 2.5, 5, 7.5, 10], inputCol='x2', outputCol='x2_new')
# pipeline stages
from pyspark.ml import Pipeline
stages = [binarizer, bucketizer]
pipeline = Pipeline(stages=stages)
# fit the pipeline model and transform the data
pipeline.fit(df).transform(df).show()
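Note that the splits above assume x2 always falls inside [0, 10]; Bucketizer raises an error at transform time for values outside the outermost split points. A minimal sketch of the usual workaround, making the outer buckets unbounded (it reuses df and x2 from above; the output column name x2_open is illustrative):

from pyspark.ml.feature import Bucketizer

open_bucketizer = Bucketizer(
    splits=[-float('inf'), 2.5, 5, 7.5, float('inf')],  # unbounded outer buckets
    inputCol='x2',
    outputCol='x2_open'
)
open_bucketizer.transform(df).show()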