11use std:: fs:: File ;
2+ use std:: io;
23use std:: io:: Error ;
34use std:: num:: NonZero ;
45use std:: sync:: Arc ;
6+ use bytes:: Bytes ;
7+ use futures:: { Stream , StreamExt } ;
8+ use futures:: stream:: BoxStream ;
59use log:: debug;
610use noodles:: { bgzf, vcf} ;
711use noodles:: vcf:: io:: Reader ;
12+ use noodles:: vcf:: Record ;
813use noodles_bgzf:: { AsyncReader , MultithreadedReader } ;
914use opendal:: { FuturesBytesStream , Operator } ;
1015use opendal:: layers:: { LoggingLayer , RetryLayer , TimeoutLayer } ;
1116use opendal:: services:: { Gcs , S3 } ;
17+ use tokio:: io:: { AsyncRead , BufReader } ;
1218use tokio_util:: io:: StreamReader ;
1319
1420
@@ -28,7 +34,7 @@ impl CompressionType {
2834 }
2935
3036 fn from_string ( compression_type : String ) -> Self {
31- match compression_type. as_str ( ) {
37+ match compression_type. to_lowercase ( ) . as_str ( ) {
3238 "gz" => CompressionType :: GZIP ,
3339 "bgz" => CompressionType :: BGZF ,
3440 "none" => CompressionType :: NONE ,
@@ -79,6 +85,9 @@ fn get_file_path(file_path: String) -> String {
7985
8086pub fn get_compression_type ( file_path : String ) -> CompressionType {
8187 //extract the file extension from path
88+ if file_path. to_lowercase ( ) . ends_with ( ".vcf" ) {
89+ return CompressionType :: NONE ;
90+ }
8291 let file_extension = file_path. split ( '.' ) . last ( ) . unwrap ( ) ;
8392 //return the compression type
8493 CompressionType :: from_string ( file_extension. to_string ( ) )
@@ -153,20 +162,108 @@ pub async fn get_remote_stream(file_path: String) -> Result<FuturesBytesStream,
153162 }
154163}
155164
156- pub async fn get_remote_vcf_reader ( file_path : String ) -> vcf:: r#async:: io:: Reader < AsyncReader < StreamReader < FuturesBytesStream , bytes :: Bytes > > > {
165+ pub async fn get_remote_vcf_bgzf_reader ( file_path : String ) -> vcf:: r#async:: io:: Reader < AsyncReader < StreamReader < FuturesBytesStream , Bytes > > > {
157166 let inner = get_remote_stream_bgzf ( file_path. clone ( ) ) . await . unwrap ( ) ;
158167 let mut reader = vcf:: r#async:: io:: Reader :: new ( inner) ;
159168 reader
160169}
161170
171+ pub async fn get_remote_vcf_reader ( file_path : String ) -> vcf:: r#async:: io:: Reader < StreamReader < FuturesBytesStream , Bytes > > {
172+ let inner = StreamReader :: new ( get_remote_stream ( file_path. clone ( ) ) . await . unwrap ( ) ) ;
173+ let mut reader = vcf:: r#async:: io:: Reader :: new ( inner) ;
174+ reader
175+ }
176+
177+
162178
163- pub fn get_local_vcf_reader ( file_path : String , thread_num : usize ) -> Result < Reader < MultithreadedReader < File > > , Error > {
179+
180+ pub fn get_local_vcf_bgzf_reader ( file_path : String , thread_num : usize ) -> Result < Reader < MultithreadedReader < File > > , Error > {
164181 debug ! ( "Reading VCF file from local storage with {} threads" , thread_num) ;
165182 File :: open ( file_path)
166183 . map ( |f| noodles_bgzf:: MultithreadedReader :: with_worker_count ( NonZero :: new ( thread_num) . unwrap ( ) , f) )
167184 . map ( vcf:: io:: Reader :: new)
168185}
169186
170187
188+ pub async fn get_local_vcf_reader ( file_path : String ) -> Result < vcf:: r#async:: io:: Reader < BufReader < tokio:: fs:: File > > , Error > {
189+ debug ! ( "Reading VCF file from local storage with async reader" ) ;
190+ let reader = tokio:: fs:: File :: open ( "sample.vcf" )
191+ . await
192+ . map ( BufReader :: new)
193+ . map ( vcf:: r#async:: io:: Reader :: new) ?;
194+ Ok ( reader)
195+ }
196+
197+
198+ pub async fn get_local_vcf_header ( file_path : String , thread_num : usize ) -> Result < vcf:: Header , Error > {
199+ let compression_type = get_compression_type ( file_path. clone ( ) ) ;
200+ let header = match compression_type {
201+ CompressionType :: BGZF | CompressionType :: GZIP => {
202+ let mut reader = get_local_vcf_bgzf_reader ( file_path, thread_num) ?;
203+ reader. read_header ( ) ?
204+ }
205+ CompressionType :: NONE => {
206+ let mut reader = get_local_vcf_reader ( file_path) . await ?;
207+ reader. read_header ( ) . await ?
208+ }
209+ } ;
210+ Ok ( header)
211+ }
171212
213+ pub async fn get_remote_vcf_header ( file_path : String ) -> Result < vcf:: Header , Error > {
214+ let compression_type = get_compression_type ( file_path. clone ( ) ) ;
215+ let header = match compression_type {
216+ CompressionType :: BGZF | CompressionType :: GZIP => {
217+ let mut reader = get_remote_vcf_bgzf_reader ( file_path) . await ;
218+ reader. read_header ( ) . await ?
219+ }
220+ CompressionType :: NONE => {
221+ let mut reader = get_remote_vcf_reader ( file_path) . await ;
222+ reader. read_header ( ) . await ?
223+ }
224+ } ;
225+ Ok ( header)
226+ }
227+
228+ pub enum VcfRemoteReader {
229+ BGZF ( vcf:: r#async:: io:: Reader < AsyncReader < StreamReader < FuturesBytesStream , Bytes > > > ) ,
230+ PLAIN ( vcf:: r#async:: io:: Reader < StreamReader < FuturesBytesStream , Bytes > > )
231+ }
232+
233+ impl VcfRemoteReader {
234+ pub async fn new ( file_path : String ) -> Self {
235+ let compression_type = get_compression_type ( file_path. clone ( ) ) ;
236+ match compression_type {
237+ CompressionType :: BGZF | CompressionType :: GZIP => {
238+ let reader = get_remote_vcf_bgzf_reader ( file_path) . await ;
239+ VcfRemoteReader :: BGZF ( reader)
240+ }
241+ CompressionType :: NONE => {
242+ let reader = get_remote_vcf_reader ( file_path) . await ;
243+ VcfRemoteReader :: PLAIN ( reader)
244+ }
245+ }
246+ }
247+ pub async fn read_header ( & mut self ) -> Result < vcf:: Header , Error > {
248+ match self {
249+ VcfRemoteReader :: BGZF ( reader) => {
250+ reader. read_header ( ) . await
251+ }
252+ VcfRemoteReader :: PLAIN ( reader) => {
253+ reader. read_header ( ) . await
254+ }
255+ }
256+ }
257+
258+ pub async fn read_records ( & mut self ) -> BoxStream < ' _ , Result < Record , Error > > {
259+ match self {
260+ VcfRemoteReader :: BGZF ( reader) => {
261+ reader. records ( ) . boxed ( )
262+ }
263+ VcfRemoteReader :: PLAIN ( reader) => {
264+ reader. records ( ) . boxed ( )
265+ }
266+ }
267+ }
268+ }
172269
0 commit comments