Just use threads and make sure the cluster has enough resources to handle both tasks simultaneously.
from threading import Thread
import time
def process(rdd, f):
def delay(x):
time.sleep(1)
return f(x)
return rdd.map(delay).sum()
rdd = sc.parallelize(range(100), int(sc.defaultParallelism / 2))
t1 = Thread(target=process, args=(rdd, lambda x: x * 2))
t2 = Thread(target=process, args=(rdd, lambda x: x + 1))
t1.start(); t2.start()
, , .
FAIR .
pyspark-asyncactions ( - ), Spark API concurrent.futures:
import asyncactions
import concurrent.futures
f1 = rdd.filter(lambda x: x % 3 == 0).countAsync()
f2 = rdd.filter(lambda x: x % 11 == 0).countAsync()
[x.result() for x in concurrent.futures.as_completed([f1, f2])]