@@ -20,7 +20,7 @@ use noodles::vcf::header::Infos;
2020use noodles:: vcf:: variant:: Record ;
2121use noodles:: vcf:: variant:: record:: { AlternateBases , Filters , Ids , ReferenceBases } ;
2222use noodles:: vcf:: variant:: record:: info:: field:: { Value , value:: Array as ValueArray } ;
23- use crate :: storage:: { get_local_vcf_bgzf_reader, get_storage_type, StorageType , VcfRemoteReader } ;
23+ use crate :: storage:: { get_local_vcf_bgzf_reader, get_storage_type, StorageType , VcfLocalReader , VcfRemoteReader } ;
2424use crate :: table_provider:: { info_to_arrow_type, OptionalField } ;
2525
2626fn build_record_batch (
@@ -184,64 +184,81 @@ async fn get_local_vcf(file_path: String, schema_ref: SchemaRef,
184184 let schema = Arc :: clone ( & schema_ref) ;
185185 let file_path = file_path. clone ( ) ;
186186 let thread_num = thread_num. unwrap_or ( 1 ) ;
187- let mut reader = get_local_vcf_bgzf_reader ( file_path, thread_num) ? ;
188- let header = reader. read_header ( ) ?;
187+ let mut reader = VcfLocalReader :: new ( file_path. clone ( ) , thread_num) . await ;
188+ let header = reader. read_header ( ) . await ?;
189189 let infos = header. infos ( ) ;
190-
190+ let mut record_num = 0 ;
191191 let mut info_builders: ( Vec < String > , Vec < DataType > , Vec < OptionalField > ) = ( Vec :: new ( ) , Vec :: new ( ) , Vec :: new ( ) ) ;
192192 set_info_builders ( batch_size, info_fields, & infos, & mut info_builders) ;
193193
194- let iter = std :: iter :: from_fn ( move || {
194+ let stream = try_stream ! {
195195
196- let mut records = reader. records ( ) ;
196+ let mut records = reader. read_records ( ) ;
197197 let iter_start_time = Instant :: now( ) ;
198- while count < batch_size {
199- let record = records. next ( ) ;
200- if record. is_none ( ) {
201- break ;
202- }
203- let record = record. unwrap ( ) . unwrap ( ) ;
204- // For each record, fill the fixed columns.
198+ while let Some ( result) = records. next( ) . await {
199+ let record = result?; // propagate errors if any
205200 chroms. push( record. reference_sequence_name( ) . to_string( ) ) ;
206- poss. push ( record. variant_start ( ) . unwrap ( ) . unwrap ( ) . get ( ) as u32 ) ;
201+ poss. push( record. variant_start( ) . unwrap( ) ? . get( ) as u32 ) ;
207202 pose. push( get_variant_end( & record, & header) ) ;
208203 ids. push( record. ids( ) . iter( ) . map( |v| v. to_string( ) ) . collect:: <Vec <String >>( ) . join( ";" ) ) ;
209204 refs. push( record. reference_bases( ) . to_string( ) ) ;
210205 alts. push( record. alternate_bases( ) . iter( ) . map( |v| v. unwrap_or( "." ) . to_string( ) ) . collect:: <Vec <String >>( ) . join( "|" ) ) ;
211206 quals. push( record. quality_score( ) . unwrap_or( Ok ( 0.0 ) ) . unwrap( ) as f64 ) ;
212207 filters. push( record. filters( ) . iter( & header) . map( |v| v. unwrap_or( "." ) . to_string( ) ) . collect:: <Vec <String >>( ) . join( ";" ) ) ;
213208 load_infos( Box :: new( record) , & header, & mut info_builders) ;
214- count += 1 ;
209+ record_num += 1 ;
210+ // Once the batch size is reached, build and yield a record batch.
211+ if record_num % batch_size == 0 {
212+ debug!( "Record number: {}" , record_num) ;
213+ let batch = build_record_batch(
214+ Arc :: clone( & schema. clone( ) ) ,
215+ & chroms,
216+ & poss,
217+ & pose,
218+ & ids,
219+ & refs,
220+ & alts,
221+ & quals,
222+ & filters,
223+ Some ( & builders_to_arrays( & mut info_builders. 2 ) ) , projection. clone( ) ,
224+ // if infos.is_empty() { None } else { Some(&infos) },
225+
226+ ) ?;
227+ batch_num += 1 ;
228+ debug!( "Batch number: {}" , batch_num) ;
229+ yield batch;
230+ // Clear vectors for the next batch.
231+ chroms. clear( ) ;
232+ poss. clear( ) ;
233+ pose. clear( ) ;
234+ ids. clear( ) ;
235+ refs. clear( ) ;
236+ alts. clear( ) ;
237+ quals. clear( ) ;
238+ filters. clear( ) ;
239+
240+ }
215241 }
216- if count == 0 {
217- return None ;
242+ // If there are remaining records that don't fill a complete batch,
243+ // yield them as well.
244+ if !chroms. is_empty( ) {
245+ let batch = build_record_batch(
246+ Arc :: clone( & schema. clone( ) ) ,
247+ & chroms,
248+ & poss,
249+ & pose,
250+ & ids,
251+ & refs,
252+ & alts,
253+ & quals,
254+ & filters,
255+ Some ( & builders_to_arrays( & mut info_builders. 2 ) ) , projection. clone( ) ,
256+ // if infos.is_empty() { None } else { Some(&infos) },
257+ ) ?;
258+ yield batch;
218259 }
219- let duration = iter_start_time. elapsed ( ) ;
220- debug ! ( "Time elapsed in iterating records: {:?}" , duration) ;
221- debug ! ( "Batch number: {}" , batch_num) ;
222- let start_time = Instant :: now ( ) ;
223- let batch = build_record_batch ( Arc :: clone ( & schema) ,
224- & chroms, & poss, & pose,
225- & ids, & refs, & alts,
226- & quals, & filters,
227- Some ( & builders_to_arrays ( & mut info_builders. 2 ) ) ,
228- projection. clone ( )
229- ) . unwrap ( ) ;
230- let duration = start_time. elapsed ( ) ;
231- debug ! ( "Time elapsed in building batch: {:?}" , duration) ;
232- count = 0 ;
233- chroms. clear ( ) ;
234- poss. clear ( ) ;
235- pose. clear ( ) ;
236- ids. clear ( ) ;
237- refs. clear ( ) ;
238- alts. clear ( ) ;
239- quals. clear ( ) ;
240- filters. clear ( ) ;
241- batch_num += 1 ;
242- Some ( Ok ( batch) )
243- } ) ;
244- Ok ( stream:: iter ( iter) )
260+ } ;
261+ Ok ( stream)
245262}
246263
247264
0 commit comments