Skip to content

Commit a16521c

Browse files
mwiewiorclaude
andcommitted
feat: Add coordinate_system_zero_based parameter to all table providers
This commit implements the coordinate system parameter for all bio format table providers (VCF, GFF, BAM, BED, CRAM) to support both 0-based and 1-based coordinate output. Changes: - Add `coordinate_system_zero_based: bool` parameter to all TableProvider constructors - Store coordinate system preference in Arrow schema metadata with key `bio.coordinate_system_zero_based` - Add `COORDINATE_SYSTEM_METADATA_KEY` constant in bio-format-core - Update position conversion in physical_exec.rs for each format: - When true (default): subtract 1 from noodles 1-based positions - When false: use noodles positions as-is (1-based) - Update all binaries, examples, and tests to pass the new parameter - Fix test assertions to expect 0-based coordinates (default) Breaking Change: All TableProvider::new() constructors now require an additional bool parameter as the last argument. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
1 parent 2cdd804 commit a16521c

39 files changed

+1071
-99
lines changed

.github/workflows/benchmark.yml.bak

Lines changed: 651 additions & 0 deletions
Large diffs are not rendered by default.

benchmarks/runner/src/main.rs

Lines changed: 15 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
use anyhow::{Context, Result};
22
use datafusion::prelude::*;
33
use datafusion_bio_benchmarks_common::{
4-
extract_drive_id, write_result, BenchmarkCategory, BenchmarkResultBuilder, DataDownloader,
5-
TestDataFile,
4+
BenchmarkCategory, BenchmarkResultBuilder, DataDownloader, TestDataFile, extract_drive_id,
5+
write_result,
66
};
77
use datafusion_bio_format_core::object_storage::ObjectStorageOptions;
88
use serde::Deserialize;
@@ -210,16 +210,22 @@ async fn register_table(
210210
"gff" => {
211211
let storage_options = ObjectStorageOptions::default();
212212
use datafusion_bio_format_gff::table_provider::GffTableProvider;
213-
let provider =
214-
GffTableProvider::new(file_path.to_string(), None, None, Some(storage_options))
215-
.context("Failed to create GFF table provider")?;
213+
let provider = GffTableProvider::new(
214+
file_path.to_string(),
215+
None,
216+
None,
217+
Some(storage_options),
218+
true,
219+
)
220+
.context("Failed to create GFF table provider")?;
216221
ctx.register_table(table_name, std::sync::Arc::new(provider))
217222
.context("Failed to register GFF table")?;
218223
}
219224
"vcf" => {
220225
use datafusion_bio_format_vcf::table_provider::VcfTableProvider;
221-
let provider = VcfTableProvider::new(file_path.to_string(), None, None, None, None)
222-
.context("Failed to create VCF table provider")?;
226+
let provider =
227+
VcfTableProvider::new(file_path.to_string(), None, None, None, None, true)
228+
.context("Failed to create VCF table provider")?;
223229
ctx.register_table(table_name, std::sync::Arc::new(provider))
224230
.context("Failed to register VCF table")?;
225231
}
@@ -232,7 +238,7 @@ async fn register_table(
232238
}
233239
"bam" => {
234240
use datafusion_bio_format_bam::table_provider::BamTableProvider;
235-
let provider = BamTableProvider::new(file_path.to_string(), None, None)
241+
let provider = BamTableProvider::new(file_path.to_string(), None, None, true)
236242
.context("Failed to create BAM table provider")?;
237243
ctx.register_table(table_name, std::sync::Arc::new(provider))
238244
.context("Failed to register BAM table")?;
@@ -241,7 +247,7 @@ async fn register_table(
241247
use datafusion_bio_format_bed::table_provider::{BEDFields, BedTableProvider};
242248
// Default to BED3 format (chrom, start, end)
243249
let provider =
244-
BedTableProvider::new(file_path.to_string(), BEDFields::BED3, None, None)
250+
BedTableProvider::new(file_path.to_string(), BEDFields::BED3, None, None, true)
245251
.context("Failed to create BED table provider")?;
246252
ctx.register_table(table_name, std::sync::Arc::new(provider))
247253
.context("Failed to register BED table")?;

datafusion/bio-format-bam/examples/test_bam_reader.rs

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,8 +14,13 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
1414
concurrent_fetches: Some(1), // Number of concurrent requests
1515
compression_type: None,
1616
};
17-
let table =
18-
BamTableProvider::new(file_path.clone(), Some(1), Some(object_storage_options)).unwrap();
17+
let table = BamTableProvider::new(
18+
file_path.clone(),
19+
Some(1),
20+
Some(object_storage_options),
21+
true,
22+
)
23+
.unwrap();
1924

2025
let ctx = datafusion::execution::context::SessionContext::new();
2126
ctx.sql("set datafusion.execution.skip_physical_aggregate_schema_check=true")

datafusion/bio-format-bam/src/lib.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@
2222
//! let ctx = SessionContext::new();
2323
//!
2424
//! // Register a BAM file as a table
25-
//! let table = BamTableProvider::new("data/alignments.bam".to_string(), None, None)?;
25+
//! let table = BamTableProvider::new("data/alignments.bam".to_string(), None, None, true)?;
2626
//! ctx.register_table("alignments", Arc::new(table))?;
2727
//!
2828
//! // Query with SQL

datafusion/bio-format-bam/src/physical_exec.rs

Lines changed: 24 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,8 @@ pub struct BamExec {
3030
pub(crate) limit: Option<usize>,
3131
pub(crate) thread_num: Option<usize>,
3232
pub(crate) object_storage_options: Option<ObjectStorageOptions>,
33+
/// If true, output 0-based half-open coordinates; if false, 1-based closed coordinates
34+
pub(crate) coordinate_system_zero_based: bool,
3335
}
3436

3537
impl Debug for BamExec {
@@ -84,6 +86,7 @@ impl ExecutionPlan for BamExec {
8486
self.thread_num,
8587
self.projection.clone(),
8688
self.object_storage_options.clone(),
89+
self.coordinate_system_zero_based,
8790
);
8891
let stream = futures::stream::once(fut).try_flatten();
8992
Ok(Box::pin(RecordBatchStreamAdapter::new(schema, stream)))
@@ -95,6 +98,7 @@ async fn get_remote_bam_stream(
9598
batch_size: usize,
9699
projection: Option<Vec<usize>>,
97100
object_storage_options: Option<ObjectStorageOptions>,
101+
coordinate_system_zero_based: bool,
98102
) -> datafusion::error::Result<
99103
AsyncStream<datafusion::error::Result<RecordBatch>, impl Future<Output = ()> + Sized>,
100104
> {
@@ -140,7 +144,9 @@ async fn get_remote_bam_stream(
140144
chrom.push(chrom_name);
141145
match record.alignment_start() {
142146
Some(start_pos) => {
143-
start.push(Some(start_pos?.get() as u32));
147+
// noodles normalizes all positions to 1-based; subtract 1 for 0-based output
148+
let pos = start_pos?.get() as u32;
149+
start.push(Some(if coordinate_system_zero_based { pos - 1 } else { pos }));
144150
},
145151
None => {
146152
start.push(None);
@@ -180,7 +186,11 @@ async fn get_remote_bam_stream(
180186
);
181187
mate_chrom.push(chrom_name);
182188
match record.mate_alignment_start() {
183-
Some(start) => mate_start.push(Some(start?.get() as u32)),
189+
Some(mate_start_pos) => {
190+
// noodles normalizes all positions to 1-based; subtract 1 for 0-based output
191+
let pos = mate_start_pos?.get() as u32;
192+
mate_start.push(Some(if coordinate_system_zero_based { pos - 1 } else { pos }));
193+
},
184194
_ => mate_start.push(None),
185195
};
186196

@@ -252,6 +262,7 @@ async fn get_local_bam(
252262
batch_size: usize,
253263
thread_num: Option<usize>,
254264
projection: Option<Vec<usize>>,
265+
coordinate_system_zero_based: bool,
255266
) -> datafusion::error::Result<impl futures::Stream<Item = datafusion::error::Result<RecordBatch>>>
256267
{
257268
let mut name: Vec<Option<String>> = Vec::with_capacity(batch_size);
@@ -298,7 +309,9 @@ async fn get_local_bam(
298309
};
299310
match record.alignment_start() {
300311
Some(start_pos) => {
301-
start.push(Some(start_pos?.get() as u32));
312+
// noodles normalizes all positions to 1-based; subtract 1 for 0-based output
313+
let pos = start_pos?.get() as u32;
314+
start.push(Some(if coordinate_system_zero_based { pos - 1 } else { pos }));
302315
},
303316
None => {
304317
start.push(None);
@@ -335,7 +348,11 @@ async fn get_local_bam(
335348
);
336349
mate_chrom.push(chrom_name);
337350
match record.mate_alignment_start() {
338-
Some(start) => mate_start.push(Some(start?.get() as u32)),
351+
Some(mate_start_pos) => {
352+
// noodles normalizes all positions to 1-based; subtract 1 for 0-based output
353+
let pos = mate_start_pos?.get() as u32;
354+
mate_start.push(Some(if coordinate_system_zero_based { pos - 1 } else { pos }));
355+
},
339356
None => mate_start.push(None),
340357
};
341358

@@ -484,6 +501,7 @@ async fn get_stream(
484501
thread_num: Option<usize>,
485502
projection: Option<Vec<usize>>,
486503
object_storage_options: Option<ObjectStorageOptions>,
504+
coordinate_system_zero_based: bool,
487505
) -> datafusion::error::Result<SendableRecordBatchStream> {
488506
// Open the BGZF-indexed VCF using IndexedReader.
489507

@@ -499,6 +517,7 @@ async fn get_stream(
499517
batch_size,
500518
thread_num,
501519
projection,
520+
coordinate_system_zero_based,
502521
)
503522
.await?;
504523
Ok(Box::pin(RecordBatchStreamAdapter::new(schema_ref, stream)))
@@ -510,6 +529,7 @@ async fn get_stream(
510529
batch_size,
511530
projection,
512531
object_storage_options,
532+
coordinate_system_zero_based,
513533
)
514534
.await?;
515535
Ok(Box::pin(RecordBatchStreamAdapter::new(schema_ref, stream)))

datafusion/bio-format-bam/src/table_provider.rs

Lines changed: 28 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -9,12 +9,14 @@ use datafusion::physical_plan::{
99
ExecutionPlan, PlanProperties,
1010
execution_plan::{Boundedness, EmissionType},
1111
};
12+
use datafusion_bio_format_core::COORDINATE_SYSTEM_METADATA_KEY;
1213
use datafusion_bio_format_core::object_storage::ObjectStorageOptions;
1314
use log::debug;
1415
use std::any::Any;
16+
use std::collections::HashMap;
1517
use std::sync::Arc;
1618

17-
fn determine_schema() -> datafusion::common::Result<SchemaRef> {
19+
fn determine_schema(coordinate_system_zero_based: bool) -> datafusion::common::Result<SchemaRef> {
1820
let fields = vec![
1921
Field::new("name", DataType::Utf8, true),
2022
Field::new("chrom", DataType::Utf8, true),
@@ -28,7 +30,13 @@ fn determine_schema() -> datafusion::common::Result<SchemaRef> {
2830
Field::new("sequence", DataType::Utf8, false),
2931
Field::new("quality_scores", DataType::Utf8, false),
3032
];
31-
let schema = Schema::new(fields);
33+
// Add coordinate system metadata to schema
34+
let mut metadata = HashMap::new();
35+
metadata.insert(
36+
COORDINATE_SYSTEM_METADATA_KEY.to_string(),
37+
coordinate_system_zero_based.to_string(),
38+
);
39+
let schema = Schema::new_with_metadata(fields, metadata);
3240
debug!("Schema: {:?}", schema);
3341
Ok(Arc::new(schema))
3442
}
@@ -48,6 +56,8 @@ pub struct BamTableProvider {
4856
thread_num: Option<usize>,
4957
/// Configuration for cloud storage access
5058
object_storage_options: Option<ObjectStorageOptions>,
59+
/// If true, output 0-based half-open coordinates; if false, 1-based closed coordinates
60+
coordinate_system_zero_based: bool,
5161
}
5262

5363
impl BamTableProvider {
@@ -58,6 +68,8 @@ impl BamTableProvider {
5868
/// * `file_path` - Path to the BAM file (local or remote URL)
5969
/// * `thread_num` - Optional number of threads for BGZF decompression
6070
/// * `object_storage_options` - Optional configuration for cloud storage access
71+
/// * `coordinate_system_zero_based` - If true (default), output 0-based half-open coordinates;
72+
/// if false, output 1-based closed coordinates
6173
///
6274
/// # Returns
6375
///
@@ -73,6 +85,7 @@ impl BamTableProvider {
7385
/// "data/alignments.bam".to_string(),
7486
/// Some(4), // Use 4 threads
7587
/// None, // No cloud storage
88+
/// true, // Use 0-based coordinates (default)
7689
/// )?;
7790
/// # Ok(())
7891
/// # }
@@ -81,13 +94,15 @@ impl BamTableProvider {
8194
file_path: String,
8295
thread_num: Option<usize>,
8396
object_storage_options: Option<ObjectStorageOptions>,
97+
coordinate_system_zero_based: bool,
8498
) -> datafusion::common::Result<Self> {
85-
let schema = determine_schema()?;
99+
let schema = determine_schema(coordinate_system_zero_based)?;
86100
Ok(Self {
87101
file_path,
88102
schema,
89103
thread_num,
90104
object_storage_options,
105+
coordinate_system_zero_based,
91106
})
92107
}
93108
}
@@ -120,12 +135,19 @@ impl TableProvider for BamTableProvider {
120135
fn project_schema(schema: &SchemaRef, projection: Option<&Vec<usize>>) -> SchemaRef {
121136
match projection {
122137
Some(indices) if indices.is_empty() => {
123-
Arc::new(Schema::new(vec![Field::new("dummy", DataType::Null, true)]))
138+
// For empty projections (COUNT(*)), use a dummy field with preserved metadata
139+
Arc::new(Schema::new_with_metadata(
140+
vec![Field::new("dummy", DataType::Null, true)],
141+
schema.metadata().clone(),
142+
))
124143
}
125144
Some(indices) => {
126145
let projected_fields: Vec<Field> =
127146
indices.iter().map(|&i| schema.field(i).clone()).collect();
128-
Arc::new(Schema::new(projected_fields))
147+
Arc::new(Schema::new_with_metadata(
148+
projected_fields,
149+
schema.metadata().clone(),
150+
))
129151
}
130152
None => schema.clone(),
131153
}
@@ -146,6 +168,7 @@ impl TableProvider for BamTableProvider {
146168
limit,
147169
thread_num: self.thread_num,
148170
object_storage_options: self.object_storage_options.clone(),
171+
coordinate_system_zero_based: self.coordinate_system_zero_based,
149172
}))
150173
}
151174
}

datafusion/bio-format-bed/examples/test_bed_reader.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
3232
BEDFields::BED4,
3333
Some(1),
3434
Some(object_storage_options),
35+
true, // Use 0-based coordinates (default)
3536
)
3637
.unwrap();
3738
let ctx = datafusion::execution::context::SessionContext::new();

datafusion/bio-format-bed/src/lib.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@
2121
//! let ctx = SessionContext::new();
2222
//!
2323
//! // Register a BED file as a table
24-
//! let table = BedTableProvider::new("data/genes.bed".to_string(), BEDFields::BED3, None, None)?;
24+
//! let table = BedTableProvider::new("data/genes.bed".to_string(), BEDFields::BED3, None, None, true)?;
2525
//! ctx.register_table("genes", Arc::new(table))?;
2626
//!
2727
//! // Query with SQL

0 commit comments

Comments
 (0)