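Generally speaking, as far as Spark is concerned, pickled files can be read with binaryFiles and deserialized in Python. First, create some dummy data: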
import tempfile
import pandas as pd
import numpy as np
# Scratch directory that will hold the sample pickle files.
outdir = tempfile.mkdtemp()

# Write five pickled DataFrames (10 rows x 2 columns of random normals).
for _ in range(5):
    # Reserve a unique file name inside outdir and close the handle right
    # away: mkstemp() would hand back an open OS-level descriptor that is
    # never closed, and to_pickle() only needs the path.
    with tempfile.NamedTemporaryFile(dir=outdir, delete=False) as handle:
        target = handle.name
    pd.DataFrame(
        np.random.randn(10, 2), columns=['foo', 'bar']
    ).to_pickle(target)
Next, read the files with binaryFiles:
# Load every file under outdir through the SparkContext `sc` (created
# elsewhere). The values of the resulting pair RDD are the raw file contents,
# which are unpickled further down; keys are presumably the file paths --
# confirm against the PySpark documentation.
rdd = sc.binaryFiles(outdir)
Then deserialize the individual objects:
import pickle
from io import BytesIO
# Drop the keys and keep only the raw byte payload of each file.
raw_payloads = rdd.values()
# Turn each payload back into the object that was pickled.
# NOTE: unpickling executes arbitrary code -- only do this with trusted files.
dfs = raw_payloads.map(pickle.loads)
# Peek at the first three rows of the first deserialized object.
dfs.first()[:3]
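One important note: since every file is loaded as a whole, this typically requires significantly more memory than simpler methods such as textFile.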
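Another approach is to parallelize only the paths and use a library that can read directly from the distributed file system, such as hdfs3. This usually lowers memory requirements at the cost of worse data locality.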
, , .
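Note:
SparkContext also provides a pickleFile method, but the name can be misleading. It reads SequenceFiles containing pickled objects, not plain Python pickle files.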