 use std::ffi::CString;
 use std::sync::Arc;
 
+use crate::errors::py_datafusion_err;
+use crate::expr::sort_expr::to_sort_expressions;
+use crate::physical_plan::PyExecutionPlan;
+use crate::record_batch::PyRecordBatchStream;
+use crate::sql::logical::PyLogicalPlan;
+use crate::utils::{get_tokio_runtime, validate_pycapsule, wait_for_future};
+use crate::{
+    errors::DataFusionError,
+    expr::{sort_expr::PySortExpr, PyExpr},
+};
 use arrow::array::{new_null_array, RecordBatch, RecordBatchIterator, RecordBatchReader};
 use arrow::compute::can_cast_types;
 use arrow::error::ArrowError;
@@ -31,12 +41,10 @@ use datafusion::common::stats::Precision;
 use datafusion::common::{DFSchema, UnnestOptions};
 use datafusion::config::{ConfigOptions, CsvOptions, TableParquetOptions};
 use datafusion::dataframe::{DataFrame, DataFrameWriteOptions};
-use datafusion::datasource::physical_plan::{FileScanConfig, ParquetExec};
-use datafusion::datasource::physical_plan::parquet::ParquetExecBuilder;
 use datafusion::execution::runtime_env::RuntimeEnvBuilder;
 use datafusion::execution::{SendableRecordBatchStream, TaskContext};
 use datafusion::parquet::basic::{BrotliLevel, Compression, GzipLevel, ZstdLevel};
-use datafusion::physical_plan::{displayable, execute_stream, ExecutionPlan};
+use datafusion::physical_plan::{execute_stream, execute_stream_partitioned, ExecutionPlan};
 use datafusion::prelude::*;
 use datafusion_expr::registry::MemoryFunctionRegistry;
 use datafusion_proto::physical_plan::{AsExecutionPlan, PhysicalExtensionCodec};
@@ -48,16 +56,6 @@ use pyo3::prelude::*;
 use pyo3::pybacked::PyBackedStr;
 use pyo3::types::{PyCapsule, PyTuple, PyTupleMethods};
 use tokio::task::JoinHandle;
-use crate::errors::py_datafusion_err;
-use crate::expr::sort_expr::to_sort_expressions;
-use crate::physical_plan::PyExecutionPlan;
-use crate::record_batch::PyRecordBatchStream;
-use crate::sql::logical::PyLogicalPlan;
-use crate::utils::{get_tokio_runtime, validate_pycapsule, wait_for_future};
-use crate::{
-    errors::DataFusionError,
-    expr::{sort_expr::PySortExpr, PyExpr},
-};
 use crate::common::df_schema::PyDFSchema;
 
 /// A PyDataFrame is a representation of a logical plan and an API to compose statements.
@@ -661,145 +659,70 @@ impl PyDataFrame {
         Ok(wait_for_future(py, self.df.as_ref().clone().count())?)
     }
 
-    fn distributed_plan(&self, num_shards: usize, py: Python<'_>) -> PyResult<DistributedPlan> {
-        let distributed_plan = wait_for_future(py, split_physical_plan(&self.df, num_shards))
-            .map_err(py_datafusion_err)?;
-        Ok(distributed_plan)
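+    /// Build a [`DistributedPlan`] for this DataFrame by creating its physical
+    /// plan and repartitioning it to the requested parallelism.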
+    fn distributed_plan(&self, parallelism: usize, py: Python<'_>) -> PyResult<DistributedPlan> {
+        let future_plan = self.df.as_ref().clone().create_physical_plan();
+        let physical_plan = wait_for_future(py, future_plan).map_err(py_datafusion_err)?;
+        DistributedPlan::try_new(physical_plan, parallelism).map_err(py_datafusion_err)
     }
 
 }
 
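+/// A protobuf-serialized physical plan together with the metadata needed to
+/// distribute it: the output schema, the partition count, and optional
+/// byte/row statistics.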
 #[pyclass(get_all)]
 #[derive(Debug, Clone)]
-pub struct Statistics {
+pub struct DistributedPlan {
+    physical_plan: Vec<u8>,
+    schema: PyDFSchema,
+    partitions: usize,
     num_bytes: Option<usize>,
     num_rows: Option<usize>,
 }
 
-impl Statistics {
-    fn new(plan: &dyn ExecutionPlan) -> Self {
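+/// Shared extension codec, so that encoding and decoding use the same handling
+/// of Delta Lake-specific plan nodes.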
+fn codec() -> &'static dyn PhysicalExtensionCodec {
+    static CODEC: DeltaPhysicalCodec = DeltaPhysicalCodec {};
+    &CODEC
+}
+
+impl DistributedPlan {
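+    /// Repartition `plan` to `parallelism` target partitions (where supported)
+    /// and capture its schema, partition count, and statistics alongside the
+    /// protobuf-encoded plan bytes.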
+    fn try_new(plan: Arc<dyn ExecutionPlan>, parallelism: usize) -> Result<Self, DataFusionError> {
         fn extract(prec: Precision<usize>) -> Option<usize> {
             match prec {
-                Precision::Exact(n) | Precision::Inexact(n) => Some(n),
-                Precision::Absent => None,
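+                // Only exact statistics are reported now; inexact estimates are dropped.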
+                Precision::Exact(n) => Some(n),
+                _ => None,
             }
         }
-        if let Ok(stats) = plan.statistics() {
+        let (num_bytes, num_rows) = if let Ok(stats) = plan.statistics() {
             let num_bytes = extract(stats.total_byte_size);
             let num_rows = extract(stats.num_rows);
-            Statistics { num_bytes, num_rows }
+            (num_bytes, num_rows)
         } else {
-            Statistics { num_bytes: None, num_rows: None }
-        }
-    }
-}
-
-#[pyclass(get_all)]
-#[derive(Debug, Clone)]
-pub struct Shard {
-    stats: Statistics,
-    serialized_plan: Vec<u8>,
-}
+            (None, None)
+        };
 
-impl Shard {
-    pub fn try_new(plan: &Arc<dyn ExecutionPlan>) -> Result<Self, DataFusionError> {
-        let stats = Statistics::new(plan.as_ref());
-        let serialized_plan = PhysicalPlanNode::try_from_physical_plan(plan.clone(), Self::codec())?
+        let schema = DFSchema::try_from(plan.schema())
+            .map(PyDFSchema::from)
+            .map_err(py_datafusion_err)?;
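+        // `repartitioned` returns Ok(None) when a node cannot change its
+        // partitioning; fall back to the original plan in that case.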
+        let plan = plan.repartitioned(parallelism, &ConfigOptions::default())
+            .map_err(py_datafusion_err)?
+            .unwrap_or(plan);
+        let partitions = plan.properties().partitioning.partition_count();
+        let physical_plan = PhysicalPlanNode::try_from_physical_plan(plan, codec())?
             .encode_to_vec();
-        Ok(Self { stats, serialized_plan })
-    }
-
-    fn codec() -> &'static dyn PhysicalExtensionCodec {
-        static CODEC: DeltaPhysicalCodec = DeltaPhysicalCodec {};
-        &CODEC
+        Ok(Self { physical_plan, schema, partitions, num_bytes, num_rows })
     }
-}
 
-#[pyclass(get_all)]
-#[derive(Debug, Clone)]
-pub struct DistributedPlan {
-    shards: Vec<Shard>,
-    schema: PyDFSchema,
-    stats: Statistics,
-}
-
-async fn split_physical_plan(df: &DataFrame, num_shards: usize) -> Result<DistributedPlan, DataFusionError> {
-    fn split(plan: &Arc<dyn ExecutionPlan>, num_shards: usize) -> Vec<Arc<dyn ExecutionPlan>> {
-        if let Some(parquet) = plan.as_any().downcast_ref::<ParquetExec>() {
-            let parquet = if let Ok(Some(repartitioned)) = parquet.repartitioned(num_shards, &ConfigOptions::default()) {
-                repartitioned.as_any().downcast_ref::<ParquetExec>()
-                    .expect("repartitioned parquet is no longer parquet")
-                    .clone()
-            } else { // repartition failed
-                parquet.clone()
-            };
-            let config = parquet.base_config();
-            config
-                .file_groups
-                .iter()
-                .map(|shard| {
-                    FileScanConfig {
-                        object_store_url: config.object_store_url.clone(),
-                        file_schema: config.file_schema.clone(),
-                        file_groups: shard.iter().map(|file| vec![file.to_owned()]).collect(), // one partition per file
-                        statistics: config.statistics.clone(),
-                        projection: config.projection.clone(),
-                        projection_deep: config.projection_deep.clone(),
-                        limit: config.limit,
-                        table_partition_cols: config.table_partition_cols.clone(),
-                        output_ordering: config.output_ordering.clone(),
-                    }
-                })
-                .map(|config| {
-                    let mut builder = ParquetExecBuilder::new(config)
-                        .with_table_parquet_options(parquet.table_parquet_options().clone());
-                    if let Some(predicate) = parquet.predicate() {
-                        builder = builder.with_predicate(predicate.clone());
-                    }
-                    builder.build_arc()
-                })
-                .map(|shard| shard as Arc<dyn ExecutionPlan>)
-                .collect()
-        } else if plan.children().len() == 0 { // TODO: split leaf nodes other than parquet?
-            vec![plan.clone()]
-        } else if plan.children().len() == 1 {
-            plan.children().into_iter()
-                .flat_map(|child| {
-                    split(child, num_shards)
-                        .into_iter()
-                        .map(|shard| plan.clone().with_new_children(vec![shard]))
-                })
-                .collect::<Result<Vec<_>, _>>()
-                .expect("Unable to split plan")
-        } else {
-            panic!(
-                "Only leaf or single-child plans are supported, found {}",
-                displayable(plan.as_ref()).one_line()
-            )
-        }
-    }
-    let plan = df.clone().create_physical_plan().await?;
-    let shards = split(&plan, num_shards)
-        .iter()
-        .map(Shard::try_new)
-        .collect::<Result<Vec<_>, _>>()?;
-    let schema = DFSchema::try_from(plan.schema().as_ref().to_owned())?.into();
-    let stats = Statistics::new(plan.as_ref());
-    Ok(DistributedPlan { shards, schema, stats })
 }
 
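+/// Decode a serialized plan and stream the record batches of a single partition.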
 #[pyfunction]
-pub fn shard_stream(serialized_shard_plan: &[u8], py: Python) -> PyResult<PyRecordBatchStream> {
+pub fn partition_stream(serialized_plan: &[u8], partition: usize, py: Python) -> PyResult<PyRecordBatchStream> {
     deltalake::ensure_initialized();
     let registry = MemoryFunctionRegistry::default();
     let runtime = RuntimeEnvBuilder::new().build()?;
-    let codec = DeltaPhysicalCodec {};
-    let node = PhysicalPlanNode::decode(serialized_shard_plan)
+    let node = PhysicalPlanNode::decode(serialized_plan)
         .map_err(|e| datafusion::error::DataFusionError::External(Box::new(e)))
         .map_err(py_datafusion_err)?;
-    let plan = node.try_into_physical_plan(&registry, &runtime, &codec)?;
+    let plan = node.try_into_physical_plan(&registry, &runtime, codec())?;
     let stream_with_runtime = get_tokio_runtime().0.spawn(async move {
-        execute_stream(plan, Arc::new(TaskContext::default()))
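+        // Execute only the requested partition; `Arc::default()` supplies a
+        // fresh default TaskContext.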
+        plan.execute(partition, Arc::default())
     });
     wait_for_future(py, stream_with_runtime)
         .map_err(py_datafusion_err)?