There is still an inelegant solution using accumulators. The drawback is that you have to choose the zero / initial values beforehand so that they do not interfere with the data:
from pyspark.accumulators import AccumulatorParam
class MinMaxAccumulatorParam(AccumulatorParam):
    """Accumulator parameter that tracks a running (minimum, maximum) pair.

    Values are two-element sequences: index 0 holds the smallest value seen
    so far and index 1 the largest.
    """

    def zero(self, value):
        """Return the "zero value" for the accumulator.

        The supplied initial value is handed back unchanged. Merging a
        (min, max) pair with itself yields the same pair, so reusing the
        initial value does not distort the result.
        """
        return value

    def addInPlace(self, val1, val2):
        """Merge two (min, max) pairs into one pair covering both.

        Returns a new tuple; neither argument is modified.
        """
        return (min(val1[0], val2[0]), max(val1[1], val2[1]))
# Seed with the identity elements of min/max (+inf, -inf) so the starting
# value can never mask real numeric data, whatever its range. Fixed bounds
# such as (500, -500) would silently clip data lying entirely outside them.
minmaxAccu = sc.accumulator(
    (float("inf"), float("-inf")), MinMaxAccumulatorParam()
)
def g(x):
    """Fold a single element into the shared min/max accumulator.

    The element is submitted as the pair (x, x), i.e. as both a candidate
    minimum and a candidate maximum.
    """
    # Accumulator.add updates the accumulator in place, so no `global`
    # rebinding is needed (unlike `+=`, which reassigns the name).
    minmaxAccu.add((x, x))
# Build a small sample RDD (sc is presumably the active SparkContext).
rdd = sc.parallelize([1, 2, 3, 4, 5])
# Run g on every element; each call folds (x, x) into minmaxAccu.
rdd.foreach(g)
In [149]: minmaxAccu.value
Out[149]: (1, 5)
source
share