Skip to content

Commit c1bfd48

Browse files
committed
block/file: do pread/pwrite from Propolis heap instead of VM memory
As described in the comment on `MAPPING_IO_LIMIT_BYTES`, this is a gross hack: if VM memory is used as the iovec for `p{read,write}v` to a block device or zvol, we'll end up contending on `svmd->svmd_lock` pretty heavily in higher-IOPS situations. This might arise when doing fewer, larger I/Os to zvols or block devices as well, since we'll be forwarding each of the 4k entries in guest PRP lists as a distinct iovec, but we haven't tested that. Either way, operating from the Propolis heap avoids the issue for now. One could imagine this "should" be a configurable behavior on file block backends, since many files (like disk images in propolis-standalone) would not need this. The hope is that we can fix this in the kernel so the hack is no longer needed for zvols or block devices, rather than leaning further into supporting this copy hack.
1 parent fd36368 commit c1bfd48

File tree

1 file changed

+167
-25
lines changed

1 file changed

+167
-25
lines changed

lib/propolis/src/vmm/mem.rs

Lines changed: 167 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -754,6 +754,28 @@ pub trait MappingExt {
754754
fn pwritev(&self, fd: RawFd, offset: i64) -> Result<usize>;
755755
}
756756

757+
// Gross hack alert: since the mappings below are memory regions backed by
758+
// segvmm_ops, `zvol_{read,write}` and similar will end up contending on
759+
// `svmd->svmd_lock`. Instead, as long as the I/O is small enough we'll tolerate
760+
// it, copy from guest memory to Propolis heap. The segment backing Propolis'
761+
// heap has an `as_page{,un}lock` impl that avoids the more
762+
// expensive/contentious `as_fault()` fallback.
763+
//
764+
// This is an optimization only until stlouis#871 can get things sorted;
765+
// once that lands, this copy will be strictly worse than directly using
766+
// the requested mappings and should be removed.
767+
//
768+
// Beyond this, either fall back to using iovecs directly (and trust the kernel
769+
// to handle bizarre vecs) or error. This is an especially gross hack because we
770+
// could/should just set MDTS, which more naturally communicates to the guest
771+
// the largest I/O we're willing to handle (and gives an out for too-large I/Os)
772+
//
773+
// The amount of memory used for temporary buffers is given by the number of
774+
// worker threads for all file-backed disks, times this threshold. It works out
775+
// to up to 128 MiB (8 worker threads) of buffers per disk by default as of
776+
// writing.
777+
const MAPPING_IO_LIMIT_BYTES: usize = 16 * crate::common::MB;
778+
757779
impl<'a, T: AsRef<[SubMapping<'a>]>> MappingExt for T {
758780
fn preadv(&self, fd: RawFd, offset: i64) -> Result<usize> {
759781
if !self
@@ -767,23 +789,98 @@ impl<'a, T: AsRef<[SubMapping<'a>]>> MappingExt for T {
767789
));
768790
}
769791

770-
let iov = self
771-
.as_ref()
772-
.iter()
773-
.map(|mapping| iovec {
774-
iov_base: mapping.ptr.as_ptr() as *mut libc::c_void,
775-
iov_len: mapping.len,
776-
})
777-
.collect::<Vec<_>>();
778-
779-
let read = unsafe {
780-
libc::preadv(fd, iov.as_ptr(), iov.len() as libc::c_int, offset)
781-
};
782-
if read == -1 {
783-
return Err(Error::last_os_error());
792+
let mut total_capacity = 0;
793+
for mapping in self.as_ref().iter() {
794+
total_capacity += mapping.len;
784795
}
785796

786-
Ok(read as usize)
797+
// Gross hack: see the comment on `MAPPING_IO_LIMIT_BYTES`.
798+
if total_capacity < MAPPING_IO_LIMIT_BYTES {
799+
// If we're motivated to avoid the zero-fill via
800+
// `Layout::with_size_align` + `GlobalAlloc::alloc`, we should
801+
// probably avoid this gross hack entirely (see comment on
802+
// MAPPING_IO_LIMIT_BYTES).
803+
let mut buf = Vec::with_capacity(total_capacity);
804+
// `buf`'s allocation should have `total_capacity` space at this
805+
// point, but using the pointer would be a bit spooky as references
806+
// to any uninitialized elements (i.e. the tail if preadv did a
807+
// short read) is Rust UB.
808+
// Zeroing is "fast" and makes this a bit less fiddly.
809+
buf.resize(total_capacity, 0);
810+
811+
let iov = vec![iovec {
812+
iov_base: buf.as_mut_ptr() as *mut libc::c_void,
813+
iov_len: buf.len(),
814+
}];
815+
816+
let read = unsafe {
817+
libc::preadv(fd, iov.as_ptr(), iov.len() as libc::c_int, offset)
818+
};
819+
if read == -1 {
820+
return Err(Error::last_os_error());
821+
}
822+
let read: usize = read.try_into().expect("read is positive");
823+
824+
// copy `read` bytes back into the iovecs and return
825+
let mut remaining = read;
826+
let mut offset = 0;
827+
for mapping in self.as_ref().iter() {
828+
let to_copy = std::cmp::min(remaining, mapping.len as usize);
829+
if to_copy == 0 {
830+
break;
831+
}
832+
833+
// Safety: guest physical memory is READ|WRITE, and won't become
834+
// unmapped as long as we hold a Mapping, which `self` implies.
835+
//
836+
// Sketchy: nothing guarantees the guest is not concurrently
837+
// modifying this memory. Also, nothing prevents the guest from
838+
// submitting the same ranges as part of another I/O where we
839+
// might be doing this same operation on another thread on the
840+
// same ptr/len pair. Because we are just moving u8's, it should
841+
// be OK.
842+
let guest_slice = unsafe {
843+
std::slice::from_raw_parts_mut(
844+
mapping.ptr.as_ptr() as *mut u8,
845+
mapping.len,
846+
)
847+
};
848+
849+
guest_slice.copy_from_slice(&buf[offset..][..to_copy]);
850+
851+
remaining -= to_copy;
852+
offset += to_copy;
853+
854+
if remaining == 0 {
855+
// Either we're at the last iov and we're finished copying
856+
// back into the guest, or `preadv` did a short read.
857+
break;
858+
}
859+
}
860+
861+
// We should never read more than the guest mappings could hold.
862+
assert_eq!(remaining, 0);
863+
864+
Ok(read)
865+
} else {
866+
let iov = self
867+
.as_ref()
868+
.iter()
869+
.map(|mapping| iovec {
870+
iov_base: mapping.ptr.as_ptr() as *mut libc::c_void,
871+
iov_len: mapping.len,
872+
})
873+
.collect::<Vec<_>>();
874+
875+
let read = unsafe {
876+
libc::preadv(fd, iov.as_ptr(), iov.len() as libc::c_int, offset)
877+
};
878+
if read == -1 {
879+
return Err(Error::last_os_error());
880+
}
881+
let read: usize = read.try_into().expect("read is positive");
882+
Ok(read)
883+
}
787884
}
788885

789886
fn pwritev(&self, fd: RawFd, offset: i64) -> Result<usize> {
@@ -798,18 +895,63 @@ impl<'a, T: AsRef<[SubMapping<'a>]>> MappingExt for T {
798895
));
799896
}
800897

801-
let iov = self
802-
.as_ref()
803-
.iter()
804-
.map(|mapping| iovec {
805-
iov_base: mapping.ptr.as_ptr() as *mut libc::c_void,
806-
iov_len: mapping.len,
807-
})
808-
.collect::<Vec<_>>();
898+
let mut total_capacity = 0;
899+
for mapping in self.as_ref().iter() {
900+
total_capacity += mapping.len;
901+
}
809902

810-
let written = unsafe {
811-
libc::pwritev(fd, iov.as_ptr(), iov.len() as libc::c_int, offset)
903+
// Gross hack: see the comment on `MAPPING_IO_LIMIT_BYTES`.
904+
let written = if total_capacity < MAPPING_IO_LIMIT_BYTES {
905+
let mut buf = Vec::with_capacity(total_capacity);
906+
for mapping in self.as_ref().iter() {
907+
// Safety: these pointer/length pairs are into guest physical
908+
// memory, which the VM has no control over. Having a `{Sub}Mapping`
909+
// is a commitment that the guest memory mapping is valid, and will
910+
// remain so until `pwritev` returns. We're going to immediately
911+
// read from this slice, but it would be just as valid to pass these
912+
// as an iovec directly to pwritev.
913+
let s = unsafe {
914+
std::slice::from_raw_parts(
915+
mapping.ptr.as_ptr() as *const u8,
916+
mapping.len,
917+
)
918+
};
919+
buf.extend_from_slice(s);
920+
}
921+
922+
let iovs = vec![iovec {
923+
iov_base: buf.as_mut_ptr() as *mut libc::c_void,
924+
iov_len: buf.len(),
925+
}];
926+
927+
unsafe {
928+
libc::pwritev(
929+
fd,
930+
iovs.as_ptr(),
931+
iovs.len() as libc::c_int,
932+
offset,
933+
)
934+
}
935+
} else {
936+
let iovs = self
937+
.as_ref()
938+
.iter()
939+
.map(|mapping| iovec {
940+
iov_base: mapping.ptr.as_ptr() as *mut libc::c_void,
941+
iov_len: mapping.len,
942+
})
943+
.collect::<Vec<_>>();
944+
945+
unsafe {
946+
libc::pwritev(
947+
fd,
948+
iovs.as_ptr(),
949+
iovs.len() as libc::c_int,
950+
offset,
951+
)
952+
}
812953
};
954+
813955
if written == -1 {
814956
return Err(Error::last_os_error());
815957
}

0 commit comments

Comments
 (0)