@@ -2,37 +2,25 @@ use std::any::Any;
22use std:: fmt:: { Debug , Formatter } ;
33use std:: sync:: Arc ;
44
5- use datafusion:: arrow:: array:: { Array , Float64Array , Int32Array , NullArray , StringArray , UInt32Array } ;
6- use datafusion:: arrow:: datatypes:: { DataType , Field , SchemaRef } ;
5+ use datafusion:: arrow:: array:: { Array , Float64Array , NullArray , StringArray , UInt32Array } ;
6+ use datafusion:: arrow:: datatypes:: { DataType , SchemaRef } ;
77use datafusion:: arrow:: record_batch:: RecordBatch ;
88use datafusion:: common:: DataFusionError ;
99use datafusion:: physical_plan:: { DisplayAs , DisplayFormatType , ExecutionPlan , PlanProperties } ;
1010use datafusion_execution:: { SendableRecordBatchStream , TaskContext } ;
1111use futures:: { stream, StreamExt , TryStreamExt } ;
12- use std:: { io, str} ;
13- use std:: fs:: File ;
14- use std:: hash:: Hasher ;
15- use std:: io:: Error ;
16- use std:: num:: NonZero ;
17- use std:: ops:: Deref ;
12+ use std:: str;
1813use std:: time:: Instant ;
1914use async_stream:: __private:: AsyncStream ;
2015use async_stream:: try_stream;
21- use datafusion:: arrow:: ipc:: FieldBuilder ;
2216use datafusion:: physical_plan:: stream:: RecordBatchStreamAdapter ;
23- use env_logger:: builder;
24- use log:: { debug, info} ;
25- use noodles:: vcf;
17+ use log:: debug;
2618use noodles:: vcf:: Header ;
2719use noodles:: vcf:: header:: Infos ;
28- use noodles:: vcf:: header:: record:: value:: map:: Info ;
29- use noodles:: vcf:: header:: record:: value:: map:: info:: { Number , Type } ;
30- use noodles:: vcf:: io:: Reader ;
3120use noodles:: vcf:: variant:: Record ;
3221use noodles:: vcf:: variant:: record:: { AlternateBases , Filters , Ids , ReferenceBases } ;
3322use noodles:: vcf:: variant:: record:: info:: field:: { Value , value:: Array as ValueArray } ;
34- use noodles_bgzf:: MultithreadedReader ;
35- use crate :: storage:: { get_compression_type, get_local_vcf_bgzf_reader, get_remote_stream_bgzf, get_remote_vcf_bgzf_reader, get_remote_vcf_header, get_remote_vcf_reader, get_storage_type, CompressionType , StorageType , VcfRemoteReader } ;
23+ use crate :: storage:: { get_local_vcf_bgzf_reader, get_storage_type, StorageType , VcfRemoteReader } ;
3624use crate :: table_provider:: { info_to_arrow_type, OptionalField } ;
3725
3826fn build_record_batch (
@@ -192,7 +180,6 @@ async fn get_local_vcf(file_path: String, schema_ref: SchemaRef,
192180 let mut filters: Vec < String > = Vec :: with_capacity ( batch_size) ;
193181
194182 let mut count: usize = 0 ;
195- let mut record_num = 0 ;
196183 let mut batch_num = 0 ;
197184 let schema = Arc :: clone ( & schema_ref) ;
198185 let file_path = file_path. clone ( ) ;
@@ -210,7 +197,6 @@ async fn get_local_vcf(file_path: String, schema_ref: SchemaRef,
210197 let iter_start_time = Instant :: now ( ) ;
211198 while count < batch_size {
212199 let record = records. next ( ) ;
213- record_num += 1 ;
214200 if record. is_none ( ) {
215201 break ;
216202 }
@@ -262,8 +248,8 @@ async fn get_local_vcf(file_path: String, schema_ref: SchemaRef,
262248
263249async fn get_remote_vcf_stream ( file_path : String , schema : SchemaRef ,
264250 batch_size : usize ,
265- info_fields : Option < Vec < String > > , projection : Option < Vec < usize > > ) -> datafusion:: error:: Result < AsyncStream < datafusion:: error:: Result < RecordBatch > , impl Future < Output =( ) > + Sized > > {
266- let mut reader = VcfRemoteReader :: new ( file_path. clone ( ) ) . await ;
251+ info_fields : Option < Vec < String > > , projection : Option < Vec < usize > > , chunk_size : usize , concurrent_fetches : usize ) -> datafusion:: error:: Result < AsyncStream < datafusion:: error:: Result < RecordBatch > , impl Future < Output =( ) > + Sized > > {
252+ let mut reader = VcfRemoteReader :: new ( file_path. clone ( ) , chunk_size , concurrent_fetches ) . await ;
267253 let header = reader. read_header ( ) . await ?;
268254 let infos = header. infos ( ) ;
269255 let mut info_builders: ( Vec < String > , Vec < DataType > , Vec < OptionalField > ) = ( Vec :: new ( ) , Vec :: new ( ) , Vec :: new ( ) ) ;
@@ -370,7 +356,7 @@ fn set_info_builders(batch_size: usize, info_fields: Option<Vec<String>>, infos:
370356
371357async fn get_stream ( file_path : String , schema_ref : SchemaRef , batch_size : usize ,
372358 thread_num : Option < usize > ,
373- info_fields : Option < Vec < String > > , projection : Option < Vec < usize > > ) -> datafusion:: error:: Result < SendableRecordBatchStream > {
359+ info_fields : Option < Vec < String > > , projection : Option < Vec < usize > > , chunk_size : usize , concurrent_fetches : usize ) -> datafusion:: error:: Result < SendableRecordBatchStream > {
374360 // Open the BGZF-indexed VCF using IndexedReader.
375361
376362 let file_path = file_path. clone ( ) ;
@@ -383,7 +369,7 @@ async fn get_stream(file_path: String, schema_ref: SchemaRef, batch_size: usize,
383369 Ok ( Box :: pin ( RecordBatchStreamAdapter :: new ( schema_ref, stream) ) )
384370 } ,
385371 StorageType :: GCS | StorageType :: S3 => {
386- let stream = get_remote_vcf_stream ( file_path. clone ( ) , schema. clone ( ) , batch_size, info_fields, projection) . await ?;
372+ let stream = get_remote_vcf_stream ( file_path. clone ( ) , schema. clone ( ) , batch_size, info_fields, projection, chunk_size , concurrent_fetches ) . await ?;
387373 Ok ( Box :: pin ( RecordBatchStreamAdapter :: new ( schema_ref, stream) ) )
388374 } ,
389375 _ => panic ! ( "Unsupported storage type" )
@@ -402,6 +388,8 @@ pub struct VcfExec {
402388 pub ( crate ) cache : PlanProperties ,
403389 pub ( crate ) limit : Option < usize > ,
404390 pub ( crate ) thread_num : Option < usize > ,
391+ pub ( crate ) chunk_size : Option < usize > ,
392+ pub ( crate ) concurrent_fetches : Option < usize > ,
405393}
406394
407395
@@ -415,6 +403,8 @@ impl VcfExec {
415403 cache : PlanProperties ,
416404 limit : Option < usize > ,
417405 thread_num : Option < usize > ,
406+ chunk_size : Option < usize > ,
407+ concurrent_fetches : Option < usize > ,
418408 ) -> Self {
419409 debug ! ( "VcfExec::new" ) ;
420410 Self {
@@ -426,21 +416,23 @@ impl VcfExec {
426416 cache,
427417 limit,
428418 thread_num,
419+ chunk_size,
420+ concurrent_fetches
429421 }
430422 }
431423}
432424
433425
434426impl Debug for VcfExec {
435- fn fmt ( & self , f : & mut Formatter < ' _ > ) -> std:: fmt:: Result {
427+ fn fmt ( & self , _f : & mut Formatter < ' _ > ) -> std:: fmt:: Result {
436428 Ok ( ( ) )
437429 }
438430}
439431
440432
441433
442434impl DisplayAs for VcfExec {
443- fn fmt_as ( & self , t : DisplayFormatType , f : & mut Formatter ) -> std:: fmt:: Result {
435+ fn fmt_as ( & self , _t : DisplayFormatType , _f : & mut Formatter ) -> std:: fmt:: Result {
444436 Ok ( ( ) )
445437 }
446438
@@ -464,7 +456,7 @@ impl ExecutionPlan for VcfExec {
464456 vec ! [ ]
465457 }
466458
467- fn with_new_children ( self : Arc < Self > , children : Vec < Arc < dyn ExecutionPlan > > ) -> datafusion:: common:: Result < Arc < dyn ExecutionPlan > > {
459+ fn with_new_children ( self : Arc < Self > , _children : Vec < Arc < dyn ExecutionPlan > > ) -> datafusion:: common:: Result < Arc < dyn ExecutionPlan > > {
468460 Ok ( self )
469461 }
470462
@@ -479,7 +471,9 @@ impl ExecutionPlan for VcfExec {
479471 schema. clone ( ) ,
480472 batch_size, self . thread_num ,
481473 self . info_fields . clone ( ) ,
482- self . projection . clone ( ) ) ;
474+ self . projection . clone ( ) ,
475+ self . chunk_size . unwrap_or ( 64 ) ,
476+ self . concurrent_fetches . unwrap_or ( 8 ) ) ;
483477 let stream = futures:: stream:: once ( fut) . try_flatten ( ) ;
484478 Ok ( Box :: pin ( RecordBatchStreamAdapter :: new ( schema, stream) ) )
485479
0 commit comments