@@ -754,6 +754,28 @@ pub trait MappingExt {
     fn pwritev(&self, fd: RawFd, offset: i64) -> Result<usize>;
 }

+// Gross hack alert: since the mappings below are memory regions backed by
+// segvmm_ops, `zvol_{read,write}` and similar will end up contending on
+// `svmd->svmd_lock`. Instead, as long as the I/O is small enough that we're
+// willing to tolerate it, copy from guest memory to the Propolis heap. The
+// segment backing Propolis' heap has an `as_page{,un}lock` impl that avoids
+// the more expensive/contentious `as_fault()` fallback.
+//
+// This is an optimization only until stlouis#871 gets things sorted, at
+// which point the extra copy should be strictly worse than directly using
+// the requested mappings.
+//
+// Above this limit, we either fall back to using the iovecs directly (and
+// trust the kernel to handle bizarre vecs) or return an error. This is an
+// especially gross hack because we could/should just set MDTS, which more
+// naturally communicates to the guest the largest I/O we're willing to
+// handle (and gives us an out for too-large I/Os).
+//
+// The amount of memory used for temporary buffers is bounded by the number
+// of worker threads per file-backed disk, times this threshold. That works
+// out to at most 128 MiB of buffers per disk (8 worker threads * 16 MiB) by
+// default as of this writing.
+const MAPPING_IO_LIMIT_BYTES: usize = 16 * crate::common::MB;
+
 impl<'a, T: AsRef<[SubMapping<'a>]>> MappingExt for T {
     fn preadv(&self, fd: RawFd, offset: i64) -> Result<usize> {
         if !self
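As a sanity check on the sizing arithmetic in the comment above: the worst case is every worker thread of a file-backed disk holding a full-sized bounce buffer at once. A minimal standalone sketch, assuming the 8-worker default the comment cites; `WORKER_COUNT` and the local `MB` are illustrative names, not identifiers from this change:

```rust
// Illustrative constants mirroring the change above.
const MB: usize = 1024 * 1024;
const MAPPING_IO_LIMIT_BYTES: usize = 16 * MB;
const WORKER_COUNT: usize = 8; // assumed default workers per file-backed disk

// Worst case: every worker holds a full-sized bounce buffer simultaneously.
const MAX_BOUNCE_BYTES_PER_DISK: usize = WORKER_COUNT * MAPPING_IO_LIMIT_BYTES;

fn main() {
    // 8 workers * 16 MiB = 128 MiB, matching the figure in the comment.
    assert_eq!(MAX_BOUNCE_BYTES_PER_DISK, 128 * MB);
    println!("{} MiB per disk, worst case", MAX_BOUNCE_BYTES_PER_DISK / MB);
}
```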
@@ -767,23 +789,98 @@ impl<'a, T: AsRef<[SubMapping<'a>]>> MappingExt for T {
             ));
         }

-        let iov = self
-            .as_ref()
-            .iter()
-            .map(|mapping| iovec {
-                iov_base: mapping.ptr.as_ptr() as *mut libc::c_void,
-                iov_len: mapping.len,
-            })
-            .collect::<Vec<_>>();
-
-        let read = unsafe {
-            libc::preadv(fd, iov.as_ptr(), iov.len() as libc::c_int, offset)
-        };
-        if read == -1 {
-            return Err(Error::last_os_error());
+        let mut total_capacity = 0;
+        for mapping in self.as_ref().iter() {
+            total_capacity += mapping.len;
         }

-        Ok(read as usize)
+        // Gross hack: see the comment on `MAPPING_IO_LIMIT_BYTES`.
+        if total_capacity < MAPPING_IO_LIMIT_BYTES {
+            // If we're motivated enough to avoid the zero-fill via
+            // `Layout::from_size_align` + `GlobalAlloc::alloc`, we should
+            // probably avoid this gross hack entirely (see the comment on
+            // `MAPPING_IO_LIMIT_BYTES`).
+            let mut buf = Vec::with_capacity(total_capacity);
+            // `buf`'s allocation has room for `total_capacity` bytes at this
+            // point, but handing out its pointer would be a bit spooky, as
+            // references to any uninitialized elements (i.e. the tail, if
+            // `preadv` did a short read) are UB in Rust. Zeroing is "fast"
+            // and makes this a bit less fiddly.
+            buf.resize(total_capacity, 0);
+
+            let iov = vec![iovec {
+                iov_base: buf.as_mut_ptr() as *mut libc::c_void,
+                iov_len: buf.len(),
+            }];
+
+            let read = unsafe {
+                libc::preadv(fd, iov.as_ptr(), iov.len() as libc::c_int, offset)
+            };
+            if read == -1 {
+                return Err(Error::last_os_error());
+            }
+            let read: usize = read.try_into().expect("read is non-negative");
+
+            // Copy `read` bytes back into the iovecs and return.
+            let mut remaining = read;
+            let mut offset = 0;
+            for mapping in self.as_ref().iter() {
+                let to_copy = std::cmp::min(remaining, mapping.len);
+                if to_copy == 0 {
+                    break;
+                }
+
+                // Safety: guest physical memory is mapped READ|WRITE, and
+                // won't become unmapped as long as we hold a `Mapping`,
+                // which holding `self` implies.
+                //
+                // Sketchy: nothing guarantees the guest is not concurrently
+                // modifying this memory. Also, nothing prevents the guest
+                // from submitting the same ranges as part of another I/O,
+                // where we might be doing this same operation on another
+                // thread with the same ptr/len pair. Because we are just
+                // moving u8s, it should be OK.
+                let guest_slice = unsafe {
+                    std::slice::from_raw_parts_mut(
+                        mapping.ptr.as_ptr() as *mut u8,
+                        mapping.len,
+                    )
+                };
+
+                // Only the first `to_copy` bytes of this mapping are covered
+                // by the read; slicing both sides keeps the lengths equal so
+                // `copy_from_slice` can't panic after a short read.
+                guest_slice[..to_copy]
+                    .copy_from_slice(&buf[offset..][..to_copy]);
+
+                remaining -= to_copy;
+                offset += to_copy;
+
+                if remaining == 0 {
+                    // Either we're at the last iov and we're finished
+                    // copying back into the guest, or `preadv` did a short
+                    // read.
+                    break;
+                }
+            }
+
+            // We should never read more than the guest mappings could hold.
+            assert_eq!(remaining, 0);
+
+            Ok(read)
+        } else {
+            let iov = self
+                .as_ref()
+                .iter()
+                .map(|mapping| iovec {
+                    iov_base: mapping.ptr.as_ptr() as *mut libc::c_void,
+                    iov_len: mapping.len,
+                })
+                .collect::<Vec<_>>();
+
+            let read = unsafe {
+                libc::preadv(fd, iov.as_ptr(), iov.len() as libc::c_int, offset)
+            };
+            if read == -1 {
+                return Err(Error::last_os_error());
+            }
+            let read: usize = read.try_into().expect("read is non-negative");
+            Ok(read)
+        }
     }

     fn pwritev(&self, fd: RawFd, offset: i64) -> Result<usize> {
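The comment in `preadv` above alludes to skipping the `resize` zero-fill with a raw allocation (`Layout::from_size_align` + `GlobalAlloc::alloc`). A safer variant of the same idea reads into the `Vec`'s uninitialized spare capacity and only then sets its length. A minimal sketch, not part of this change; the function name is illustrative:

```rust
use std::io::{Error, Result};
use std::os::unix::io::RawFd;

/// Hypothetical helper: preadv into uninitialized spare capacity, avoiding
/// the zero-fill that `buf.resize(total_capacity, 0)` performs.
fn preadv_uninit(fd: RawFd, total_capacity: usize, offset: i64) -> Result<Vec<u8>> {
    let mut buf: Vec<u8> = Vec::with_capacity(total_capacity);
    let iov = libc::iovec {
        // Spare capacity is valid-to-write memory for the kernel to fill; we
        // never form a reference to it while it is uninitialized.
        iov_base: buf.spare_capacity_mut().as_mut_ptr() as *mut libc::c_void,
        iov_len: total_capacity,
    };
    let read = unsafe { libc::preadv(fd, &iov, 1, offset) };
    if read == -1 {
        return Err(Error::last_os_error());
    }
    // Safety: the kernel initialized exactly `read` bytes at the start of
    // the allocation, and `read <= total_capacity`.
    unsafe { buf.set_len(read as usize) };
    Ok(buf)
}
```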
@@ -798,18 +895,63 @@ impl<'a, T: AsRef<[SubMapping<'a>]>> MappingExt for T {
             ));
         }

-        let iov = self
-            .as_ref()
-            .iter()
-            .map(|mapping| iovec {
-                iov_base: mapping.ptr.as_ptr() as *mut libc::c_void,
-                iov_len: mapping.len,
-            })
-            .collect::<Vec<_>>();
+        let mut total_capacity = 0;
+        for mapping in self.as_ref().iter() {
+            total_capacity += mapping.len;
+        }

-        let written = unsafe {
-            libc::pwritev(fd, iov.as_ptr(), iov.len() as libc::c_int, offset)
+        // Gross hack: see the comment on `MAPPING_IO_LIMIT_BYTES`.
+        let written = if total_capacity < MAPPING_IO_LIMIT_BYTES {
+            let mut buf = Vec::with_capacity(total_capacity);
+            for mapping in self.as_ref().iter() {
+                // Safety: these pointer/length pairs are into guest physical
+                // memory, whose mapping the guest has no control over.
+                // Having a `{Sub}Mapping` is a commitment that the guest
+                // memory mapping is valid and will remain so until `pwritev`
+                // returns. We're going to immediately read from this slice,
+                // but it would be just as valid to pass these as an iovec
+                // directly to `pwritev`.
+                let s = unsafe {
+                    std::slice::from_raw_parts(
+                        mapping.ptr.as_ptr() as *const u8,
+                        mapping.len,
+                    )
+                };
+                buf.extend_from_slice(s);
+            }
+
+            let iovs = vec![iovec {
+                iov_base: buf.as_mut_ptr() as *mut libc::c_void,
+                iov_len: buf.len(),
+            }];
+
+            unsafe {
+                libc::pwritev(
+                    fd,
+                    iovs.as_ptr(),
+                    iovs.len() as libc::c_int,
+                    offset,
+                )
+            }
+        } else {
+            let iovs = self
+                .as_ref()
+                .iter()
+                .map(|mapping| iovec {
+                    iov_base: mapping.ptr.as_ptr() as *mut libc::c_void,
+                    iov_len: mapping.len,
+                })
+                .collect::<Vec<_>>();
+
+            unsafe {
+                libc::pwritev(
+                    fd,
+                    iovs.as_ptr(),
+                    iovs.len() as libc::c_int,
+                    offset,
+                )
+            }
         };
+
         if written == -1 {
             return Err(Error::last_os_error());
         }
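On the MDTS alternative the lead comment mentions: NVMe's Identify Controller structure reports MDTS (Maximum Data Transfer Size) as a power-of-two exponent in units of the controller's minimum memory page size (CAP.MPSMIN). A sketch of the arithmetic for expressing this change's 16 MiB cap as an MDTS value, assuming a 4 KiB minimum page; all names here are illustrative:

```rust
const MIN_PAGE_SIZE: usize = 4096; // assumed CAP.MPSMIN page size
const MAPPING_IO_LIMIT_BYTES: usize = 16 * 1024 * 1024;

/// MDTS encodes the max transfer as 2^n minimum-size pages, so the limit
/// must be a power-of-two multiple of the page size.
fn mdts_for_limit(limit_bytes: usize, page_size: usize) -> u8 {
    let pages = limit_bytes / page_size;
    assert!(pages.is_power_of_two(), "MDTS can only express powers of two");
    pages.trailing_zeros() as u8
}

fn main() {
    // 16 MiB / 4 KiB = 4096 pages = 2^12 pages => MDTS = 12.
    assert_eq!(mdts_for_limit(MAPPING_IO_LIMIT_BYTES, MIN_PAGE_SIZE), 12);
}
```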