, , , . , sqlContext.createDataFrame .
Why 2000 partitions?
Spark 2000 , 2000 . ( , , , ).
:
>>> df.rdd.getNumPartitions()
2000
Why does the DataFrame have 2000 partitions?
, sqlContext.createDataFrame (2000 ), , .
.
In sql/context.py, sqlContext.createDataFrame contains this call (for local, non-RDD input):
rdd, schema = self._createFromLocal(data, schema)
and _createFromLocal ends with:
return self._sc.parallelize(data), schema
The parallelize method called here (on the SparkContext, self._sc) is defined in context.py, where the number of slices is chosen as:
numSlices = int(numSlices) if numSlices is not None else self.defaultParallelism
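So, since no numSlices is passed, the partition count comes from the default parallelism rather than from sqlContext.createDataFrame.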
How can the number of partitions of the DataFrame be changed?
Use DataFrame.coalesce.
>>> smdf = df.coalesce(1)
>>> smdf.rdd.getNumPartitions()
1
>>> smdf.explain()
== Physical Plan ==
Coalesce 1
+- Scan ExistingRDD[a#0L,b#1L]
>>> smdf.collect()
[Row(a=1, b=2)]