In [1]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
In [2]:
# Create (or reuse) the session via the modern builder API instead of
# constructing SparkContext/SparkSession by hand; getOrCreate() is idempotent,
# so re-running this cell on a live kernel does not raise "SparkContext
# already exists". The SparkContext is then derived from the session.
spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext
In [3]:
from pyspark.ml.linalg import Vector, DenseVector, SparseVector
A vector can be represented in dense or sparse format. A dense vector is a regular vector that stores every element explicitly, including zeros. A sparse vector uses three components (size, active indices, and active values) to represent the same vector with less memory.
In [22]:
# A dense vector stores all six entries explicitly, zeros included.
dv = DenseVector([1.0, 0.0, 0.0, 0.0, 4.5, 0.0])
dv
Out[22]:
We can use the SparseVector()
function to create a sparse vector. The first argument is the vector size, the second
argument is a dictionary whose keys are the indices of the active (non-zero) elements and whose values are those elements' values.
In [23]:
# Same vector as `dv`, but sparse: size 6, with only the active (non-zero)
# entries stored as an {index: value} mapping.
sv = SparseVector(6, {0: 1.0, 4: 4.5})
sv
Out[23]:
In [30]:
# Convert the sparse vector back to dense form via its ndarray representation.
sv_as_dense = DenseVector(sv.toArray())
sv_as_dense
Out[30]:
In [33]:
# Collect the non-zero (active) entries of the dense vector as {index: value},
# i.e. the mapping a SparseVector needs.
active_elements_dict = {}
for idx, val in enumerate(dv):
    if val != 0:
        active_elements_dict[idx] = val
active_elements_dict
Out[33]:
In [34]:
# Rebuild a sparse vector from the dense one: total size plus the
# active-elements mapping computed above.
vector_size = len(dv)
SparseVector(vector_size, active_elements_dict)
Out[34]:
In [ ]: