diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 35958f16288ff..672118bf5593e 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -6220,9 +6220,14 @@ rdt= [HW,X86,RDT] Turn on/off individual RDT features. List is: cmt, mbmtotal, mbmlocal, l3cat, l3cdp, l2cat, l2cdp, - mba, smba, bmec, abmc. + mba, smba, bmec, abmc, sdciae, energy[:guid], + perf[:guid]. E.g. to turn on cmt and turn off mba use: rdt=cmt,!mba + To turn off all energy telemetry monitoring and ensure that + perf telemetry monitoring associated with guid 0x12345 + is enabled use: + rdt=!energy,perf:0x12345 reboot= [KNL] Format (x86 or x86_64): diff --git a/Documentation/arch/arm64/index.rst b/Documentation/arch/arm64/index.rst index 6a012c98bdcd3..189fa760dadea 100644 --- a/Documentation/arch/arm64/index.rst +++ b/Documentation/arch/arm64/index.rst @@ -23,6 +23,7 @@ ARM64 Architecture memory memory-tagging-extension mops + mpam perf pointer-authentication ptdump diff --git a/Documentation/arch/arm64/mpam.rst b/Documentation/arch/arm64/mpam.rst new file mode 100644 index 0000000000000..570f51a8d4ebf --- /dev/null +++ b/Documentation/arch/arm64/mpam.rst @@ -0,0 +1,72 @@ +.. SPDX-License-Identifier: GPL-2.0 + +==== +MPAM +==== + +What is MPAM +============ +MPAM (Memory Partitioning and Monitoring) is a feature in the CPUs and memory +system components such as the caches or memory controllers that allow memory +traffic to be labelled, partitioned and monitored. + +Traffic is labelled by the CPU, based on the control or monitor group the +current task is assigned to using resctrl. Partitioning policy can be set +using the schemata file in resctrl, and monitor values read via resctrl. +See Documentation/filesystems/resctrl.rst for more details. + +This allows tasks that share memory system resources, such as caches, to be +isolated from each other according to the partitioning policy (so called noisy +neighbours). + +Supported Platforms +=================== +Use of this feature requires CPU support, support in the memory system +components, and a description from firmware of where the MPAM device controls +are in the MMIO address space. (e.g. the 'MPAM' ACPI table). + +The MMIO device that provides MPAM controls/monitors for a memory system +component is called a memory system component. (MSC). + +Because the user interface to MPAM is via resctrl, only MPAM features that are +compatible with resctrl can be exposed to user-space. + +MSC are considered as a group based on the topology. MSC that correspond with +the L3 cache are considered together, it is not possible to mix MSC between L2 +and L3 to 'cover' a resctrl schema. + +The supported features are: + +* Cache portion bitmap controls (CPOR) on the L2 or L3 caches. To expose + CPOR at L2 or L3, every CPU must have a corresponding CPU cache at this + level that also supports the feature. Mismatched big/little platforms are + not supported as resctrl's controls would then also depend on task + placement. + +* Memory bandwidth maximum controls (MBW_MAX) on or after the L3 cache. + resctrl uses the L3 cache-id to identify where the memory bandwidth + control is applied. For this reason the platform must have an L3 cache + with cache-id's supplied by firmware. (It doesn't need to support MPAM.) + + To be exported as the 'MB' schema, the topology of the group of MSC chosen + must match the topology of the L3 cache so that the cache-id's can be + repainted. For example: Platforms with Memory bandwidth maximum controls + on CPU-less NUMA nodes cannot expose the 'MB' schema to resctrl as these + nodes do not have a corresponding L3 cache. If the memory bandwidth + control is on the memory rather than the L3 then there must be a single + global L3 as otherwise it is unknown which L3 the traffic came from. There + must be no caches between the L3 and the memory so that the two ends of + the path have equivalent traffic. + + When the MPAM driver finds multiple groups of MSC it can use for the 'MB' + schema, it prefers the group closest to the L3 cache. + +* Cache Storage Usage (CSU) counters can expose the 'llc_occupancy' provided + there is at least one CSU monitor on each MSC that makes up the L3 group. + Exposing CSU counters from other caches or devices is not supported. + +Reporting Bugs +============== +If you are not seeing the counters or controls you expect please share the +debug messages produced when enabling dynamic debug and booting with: +dyndbg="file mpam_resctrl.c +pl" diff --git a/Documentation/arch/arm64/silicon-errata.rst b/Documentation/arch/arm64/silicon-errata.rst index 93cdf16937159..5650c4b6d2c6d 100644 --- a/Documentation/arch/arm64/silicon-errata.rst +++ b/Documentation/arch/arm64/silicon-errata.rst @@ -215,6 +215,9 @@ stable kernels. | ARM | GIC-700 | #2941627 | ARM64_ERRATUM_2941627 | +----------------+-----------------+-----------------+-----------------------------+ +----------------+-----------------+-----------------+-----------------------------+ +| ARM | CMN-650 | #3642720 | N/A | ++----------------+-----------------+-----------------+-----------------------------+ ++----------------+-----------------+-----------------+-----------------------------+ | Broadcom | Brahma-B53 | N/A | ARM64_ERRATUM_845719 | +----------------+-----------------+-----------------+-----------------------------+ | Broadcom | Brahma-B53 | N/A | ARM64_ERRATUM_843419 | @@ -248,6 +251,12 @@ stable kernels. +----------------+-----------------+-----------------+-----------------------------+ | NVIDIA | T241 GICv3/4.x | T241-FABRIC-4 | N/A | +----------------+-----------------+-----------------+-----------------------------+ +| NVIDIA | T241 MPAM | T241-MPAM-1 | N/A | ++----------------+-----------------+-----------------+-----------------------------+ +| NVIDIA | T241 MPAM | T241-MPAM-4 | N/A | ++----------------+-----------------+-----------------+-----------------------------+ +| NVIDIA | T241 MPAM | T241-MPAM-6 | N/A | ++----------------+-----------------+-----------------+-----------------------------+ +----------------+-----------------+-----------------+-----------------------------+ | Freescale/NXP | LS2080A/LS1043A | A-008585 | FSL_ERRATUM_A008585 | +----------------+-----------------+-----------------+-----------------------------+ diff --git a/Documentation/driver-api/pci/p2pdma.rst b/Documentation/driver-api/pci/p2pdma.rst index d0b241628cf13..280673b50350b 100644 --- a/Documentation/driver-api/pci/p2pdma.rst +++ b/Documentation/driver-api/pci/p2pdma.rst @@ -9,22 +9,48 @@ between two devices on the bus. This type of transaction is henceforth called Peer-to-Peer (or P2P). However, there are a number of issues that make P2P transactions tricky to do in a perfectly safe way. -One of the biggest issues is that PCI doesn't require forwarding -transactions between hierarchy domains, and in PCIe, each Root Port -defines a separate hierarchy domain. To make things worse, there is no -simple way to determine if a given Root Complex supports this or not. -(See PCIe r4.0, sec 1.3.1). Therefore, as of this writing, the kernel -only supports doing P2P when the endpoints involved are all behind the -same PCI bridge, as such devices are all in the same PCI hierarchy -domain, and the spec guarantees that all transactions within the -hierarchy will be routable, but it does not require routing -between hierarchies. - -The second issue is that to make use of existing interfaces in Linux, -memory that is used for P2P transactions needs to be backed by struct -pages. However, PCI BARs are not typically cache coherent so there are -a few corner case gotchas with these pages so developers need to -be careful about what they do with them. +For PCIe the routing of Transaction Layer Packets (TLPs) is well-defined up +until they reach a host bridge or root port. If the path includes PCIe switches +then based on the ACS settings the transaction can route entirely within +the PCIe hierarchy and never reach the root port. The kernel will evaluate +the PCIe topology and always permit P2P in these well-defined cases. + +However, if the P2P transaction reaches the host bridge then it might have to +hairpin back out the same root port, be routed inside the CPU SOC to another +PCIe root port, or routed internally to the SOC. + +The PCIe specification doesn't define the forwarding of transactions between +hierarchy domains and kernel defaults to blocking such routing. There is an +allow list to allow detecting known-good HW, in which case P2P between any +two PCIe devices will be permitted. + +Since P2P inherently is doing transactions between two devices it requires two +drivers to be co-operating inside the kernel. The providing driver has to convey +its MMIO to the consuming driver. To meet the driver model lifecycle rules the +MMIO must have all DMA mapping removed, all CPU accesses prevented, all page +table mappings undone before the providing driver completes remove(). + +This requires the providing and consuming driver to actively work together to +guarantee that the consuming driver has stopped using the MMIO during a removal +cycle. This is done by either a synchronous invalidation shutdown or waiting +for all usage refcounts to reach zero. + +At the lowest level the P2P subsystem offers a naked struct p2p_provider that +delegates lifecycle management to the providing driver. It is expected that +drivers using this option will wrap their MMIO memory in DMABUF and use DMABUF +to provide an invalidation shutdown. These MMIO addresess have no struct page, and +if used with mmap() must create special PTEs. As such there are very few +kernel uAPIs that can accept pointers to them; in particular they cannot be used +with read()/write(), including O_DIRECT. + +Building on this, the subsystem offers a layer to wrap the MMIO in a ZONE_DEVICE +pgmap of MEMORY_DEVICE_PCI_P2PDMA to create struct pages. The lifecycle of +pgmap ensures that when the pgmap is destroyed all other drivers have stopped +using the MMIO. This option works with O_DIRECT flows, in some cases, if the +underlying subsystem supports handling MEMORY_DEVICE_PCI_P2PDMA through +FOLL_PCI_P2PDMA. The use of FOLL_LONGTERM is prevented. As this relies on pgmap +it also relies on architecture support along with alignment and minimum size +limitations. Driver Writer's Guide @@ -114,14 +140,39 @@ allocating scatter-gather lists with P2P memory. Struct Page Caveats ------------------- -Driver writers should be very careful about not passing these special -struct pages to code that isn't prepared for it. At this time, the kernel -interfaces do not have any checks for ensuring this. This obviously -precludes passing these pages to userspace. +While the MEMORY_DEVICE_PCI_P2PDMA pages can be installed in VMAs, +pin_user_pages() and related will not return them unless FOLL_PCI_P2PDMA is set. -P2P memory is also technically IO memory but should never have any side -effects behind it. Thus, the order of loads and stores should not be important -and ioreadX(), iowriteX() and friends should not be necessary. +The MEMORY_DEVICE_PCI_P2PDMA pages require care to support in the kernel. The +KVA is still MMIO and must still be accessed through the normal +readX()/writeX()/etc helpers. Direct CPU access (e.g. memcpy) is forbidden, just +like any other MMIO mapping. While this will actually work on some +architectures, others will experience corruption or just crash in the kernel. +Supporting FOLL_PCI_P2PDMA in a subsystem requires scrubbing it to ensure no CPU +access happens. + + +Usage With DMABUF +================= + +DMABUF provides an alternative to the above struct page-based +client/provider/orchestrator system and should be used when struct page +doesn't exist. In this mode the exporting driver will wrap +some of its MMIO in a DMABUF and give the DMABUF FD to userspace. + +Userspace can then pass the FD to an importing driver which will ask the +exporting driver to map it to the importer. + +In this case the initiator and target pci_devices are known and the P2P subsystem +is used to determine the mapping type. The phys_addr_t-based DMA API is used to +establish the dma_addr_t. + +Lifecycle is controlled by DMABUF move_notify(). When the exporting driver wants +to remove() it must deliver an invalidation shutdown to all DMABUF importing +drivers through move_notify() and synchronously DMA unmap all the MMIO. + +No importing driver can continue to have a DMA map to the MMIO after the +exporting driver has destroyed its p2p_provider. P2P DMA Support Library diff --git a/Documentation/filesystems/resctrl.rst b/Documentation/filesystems/resctrl.rst index b7f35b07876a9..45dde87741288 100644 --- a/Documentation/filesystems/resctrl.rst +++ b/Documentation/filesystems/resctrl.rst @@ -17,17 +17,18 @@ AMD refers to this feature as AMD Platform Quality of Service(AMD QoS). This feature is enabled by the CONFIG_X86_CPU_RESCTRL and the x86 /proc/cpuinfo flag bits: -=============================================== ================================ -RDT (Resource Director Technology) Allocation "rdt_a" -CAT (Cache Allocation Technology) "cat_l3", "cat_l2" -CDP (Code and Data Prioritization) "cdp_l3", "cdp_l2" -CQM (Cache QoS Monitoring) "cqm_llc", "cqm_occup_llc" -MBM (Memory Bandwidth Monitoring) "cqm_mbm_total", "cqm_mbm_local" -MBA (Memory Bandwidth Allocation) "mba" -SMBA (Slow Memory Bandwidth Allocation) "" -BMEC (Bandwidth Monitoring Event Configuration) "" -ABMC (Assignable Bandwidth Monitoring Counters) "" -=============================================== ================================ +=============================================================== ================================ +RDT (Resource Director Technology) Allocation "rdt_a" +CAT (Cache Allocation Technology) "cat_l3", "cat_l2" +CDP (Code and Data Prioritization) "cdp_l3", "cdp_l2" +CQM (Cache QoS Monitoring) "cqm_llc", "cqm_occup_llc" +MBM (Memory Bandwidth Monitoring) "cqm_mbm_total", "cqm_mbm_local" +MBA (Memory Bandwidth Allocation) "mba" +SMBA (Slow Memory Bandwidth Allocation) "" +BMEC (Bandwidth Monitoring Event Configuration) "" +ABMC (Assignable Bandwidth Monitoring Counters) "" +SDCIAE (Smart Data Cache Injection Allocation Enforcement) "" +=============================================================== ================================ Historically, new features were made visible by default in /proc/cpuinfo. This resulted in the feature flags becoming hard to parse by humans. Adding a new @@ -72,6 +73,11 @@ The 'info' directory contains information about the enabled resources. Each resource has its own subdirectory. The subdirectory names reflect the resource names. +Most of the files in the resource's subdirectory are read-only, and +describe properties of the resource. Resources that support global +configuration options also include writable files that can be used +to modify those settings. + Each subdirectory contains the following files with respect to allocation: @@ -90,12 +96,19 @@ related to allocation: must be set when writing a mask. "shareable_bits": - Bitmask of shareable resource with other executing - entities (e.g. I/O). User can use this when - setting up exclusive cache partitions. Note that - some platforms support devices that have their - own settings for cache use which can over-ride - these bits. + Bitmask of shareable resource with other executing entities + (e.g. I/O). Applies to all instances of this resource. User + can use this when setting up exclusive cache partitions. + Note that some platforms support devices that have their + own settings for cache use which can over-ride these bits. + + When "io_alloc" is enabled, a portion of each cache instance can + be configured for shared use between hardware and software. + "bit_usage" should be used to see which portions of each cache + instance is configured for hardware use via "io_alloc" feature + because every cache instance can have its "io_alloc" bitmask + configured independently via "io_alloc_cbm". + "bit_usage": Annotated capacity bitmasks showing how all instances of the resource are used. The legend is: @@ -109,16 +122,16 @@ related to allocation: "H": Corresponding region is used by hardware only but available for software use. If a resource - has bits set in "shareable_bits" but not all - of these bits appear in the resource groups' - schematas then the bits appearing in - "shareable_bits" but no resource group will - be marked as "H". + has bits set in "shareable_bits" or "io_alloc_cbm" + but not all of these bits appear in the resource + groups' schemata then the bits appearing in + "shareable_bits" or "io_alloc_cbm" but no + resource group will be marked as "H". "X": Corresponding region is available for sharing and - used by hardware and software. These are the - bits that appear in "shareable_bits" as - well as a resource group's allocation. + used by hardware and software. These are the bits + that appear in "shareable_bits" or "io_alloc_cbm" + as well as a resource group's allocation. "S": Corresponding region is used by software and available for sharing. @@ -136,6 +149,77 @@ related to allocation: "1": Non-contiguous 1s value in CBM is supported. +"io_alloc": + "io_alloc" enables system software to configure the portion of + the cache allocated for I/O traffic. File may only exist if the + system supports this feature on some of its cache resources. + + "disabled": + Resource supports "io_alloc" but the feature is disabled. + Portions of cache used for allocation of I/O traffic cannot + be configured. + "enabled": + Portions of cache used for allocation of I/O traffic + can be configured using "io_alloc_cbm". + "not supported": + Support not available for this resource. + + The feature can be modified by writing to the interface, for example: + + To enable:: + + # echo 1 > /sys/fs/resctrl/info/L3/io_alloc + + To disable:: + + # echo 0 > /sys/fs/resctrl/info/L3/io_alloc + + The underlying implementation may reduce resources available to + general (CPU) cache allocation. See architecture specific notes + below. Depending on usage requirements the feature can be enabled + or disabled. + + On AMD systems, io_alloc feature is supported by the L3 Smart + Data Cache Injection Allocation Enforcement (SDCIAE). The CLOSID for + io_alloc is the highest CLOSID supported by the resource. When + io_alloc is enabled, the highest CLOSID is dedicated to io_alloc and + no longer available for general (CPU) cache allocation. When CDP is + enabled, io_alloc routes I/O traffic using the highest CLOSID allocated + for the instruction cache (CDP_CODE), making this CLOSID no longer + available for general (CPU) cache allocation for both the CDP_CODE + and CDP_DATA resources. + +"io_alloc_cbm": + Capacity bitmasks that describe the portions of cache instances to + which I/O traffic from supported I/O devices are routed when "io_alloc" + is enabled. + + CBMs are displayed in the following format: + + =;=;... + + Example:: + + # cat /sys/fs/resctrl/info/L3/io_alloc_cbm + 0=ffff;1=ffff + + CBMs can be configured by writing to the interface. + + Example:: + + # echo 1=ff > /sys/fs/resctrl/info/L3/io_alloc_cbm + # cat /sys/fs/resctrl/info/L3/io_alloc_cbm + 0=ffff;1=00ff + + # echo "0=ff;1=f" > /sys/fs/resctrl/info/L3/io_alloc_cbm + # cat /sys/fs/resctrl/info/L3/io_alloc_cbm + 0=00ff;1=000f + + When CDP is enabled "io_alloc_cbm" associated with the CDP_DATA and CDP_CODE + resources may reflect the same values. For example, values read from and + written to /sys/fs/resctrl/info/L3DATA/io_alloc_cbm may be reflected by + /sys/fs/resctrl/info/L3CODE/io_alloc_cbm and vice versa. + Memory bandwidth(MB) subdirectory contains the following files with respect to allocation: @@ -168,13 +252,12 @@ with respect to allocation: bandwidth percentages are directly applied to the threads running on the core -If RDT monitoring is available there will be an "L3_MON" directory +If L3 monitoring is available there will be an "L3_MON" directory with the following files: "num_rmids": - The number of RMIDs available. This is the - upper bound for how many "CTRL_MON" + "MON" - groups can be created. + The number of RMIDs supported by hardware for + L3 monitoring events. "mon_features": Lists the monitoring events if @@ -400,6 +483,24 @@ with the following files: bytes) at which a previously used LLC_occupancy counter can be considered for re-use. +If telemetry monitoring is available there will be a "PERF_PKG_MON" directory +with the following files: + +"num_rmids": + The number of RMIDs for telemetry monitoring events. + + On Intel resctrl will not enable telemetry events if the number of + RMIDs that can be tracked concurrently is lower than the total number + of RMIDs supported. Telemetry events can be force-enabled with the + "rdt=" kernel parameter, but this may reduce the number of + monitoring groups that can be created. + +"mon_features": + Lists the telemetry monitoring events that are enabled on this system. + +The upper bound for how many "CTRL_MON" + "MON" can be created +is the smaller of the L3_MON and PERF_PKG_MON "num_rmids" values. + Finally, in the top level of the "info" directory there is a file named "last_cmd_status". This is reset with every "command" issued via the file system (making new directories or writing to any of the @@ -505,15 +606,40 @@ When control is enabled all CTRL_MON groups will also contain: When monitoring is enabled all MON groups will also contain: "mon_data": - This contains a set of files organized by L3 domain and by - RDT event. E.g. on a system with two L3 domains there will - be subdirectories "mon_L3_00" and "mon_L3_01". Each of these - directories have one file per event (e.g. "llc_occupancy", - "mbm_total_bytes", and "mbm_local_bytes"). In a MON group these - files provide a read out of the current value of the event for - all tasks in the group. In CTRL_MON groups these files provide - the sum for all tasks in the CTRL_MON group and all tasks in + This contains directories for each monitor domain. + + If L3 monitoring is enabled, there will be a "mon_L3_XX" directory for + each instance of an L3 cache. Each directory contains files for the enabled + L3 events (e.g. "llc_occupancy", "mbm_total_bytes", and "mbm_local_bytes"). + + If telemetry monitoring is enabled, there will be a "mon_PERF_PKG_YY" + directory for each physical processor package. Each directory contains + files for the enabled telemetry events (e.g. "core_energy". "activity", + "uops_retired", etc.) + + The info/`*`/mon_features files provide the full list of enabled + event/file names. + + "core energy" reports a floating point number for the energy (in Joules) + consumed by cores (registers, arithmetic units, TLB and L1/L2 caches) + during execution of instructions summed across all logical CPUs on a + package for the current monitoring group. + + "activity" also reports a floating point value (in Farads). This provides + an estimate of work done independent of the frequency that the CPUs used + for execution. + + Note that "core energy" and "activity" only measure energy/activity in the + "core" of the CPU (arithmetic units, TLB, L1 and L2 caches, etc.). They + do not include L3 cache, memory, I/O devices etc. + + All other events report decimal integer values. + + In a MON group these files provide a read out of the current value of + the event for all tasks in the group. In CTRL_MON groups these files + provide the sum for all tasks in the CTRL_MON group and all tasks in MON groups. Please see example section for more details on usage. + On systems with Sub-NUMA Cluster (SNC) enabled there are extra directories for each node (located within the "mon_L3_XX" directory for the L3 cache they occupy). These are named "mon_sub_L3_YY" diff --git a/MAINTAINERS b/MAINTAINERS index 554e881b05bea..ed15552a77c0a 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -11558,6 +11558,7 @@ M: Miaohe Lin R: Naoya Horiguchi L: linux-mm@kvack.org S: Maintained +F: include/linux/memory-failure.h F: mm/hwpoison-inject.c F: mm/memory-failure.c @@ -17468,6 +17469,16 @@ S: Maintained F: Documentation/devicetree/bindings/leds/backlight/mps,mp3309c.yaml F: drivers/video/backlight/mp3309c.c +MPAM DRIVER +M: James Morse +M: Ben Horgan +R: Reinette Chatre +R: Fenghua Yu +S: Maintained +F: drivers/resctrl/mpam_* +F: drivers/resctrl/test_mpam_* +F: include/linux/arm_mpam.h + MPS MP2869 DRIVER M: Wensheng Wang L: linux-hwmon@vger.kernel.org diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index cdcf3a0fafd1a..a87305c21ad49 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -2065,6 +2065,33 @@ config ARM64_TLB_RANGE ARMv8.4-TLBI provides TLBI invalidation instruction that apply to a range of input addresses. +config ARM64_MPAM + bool "Enable support for MPAM" + select ARM64_MPAM_DRIVER + select ARCH_HAS_CPU_RESCTRL + help + Memory System Resource Partitioning and Monitoring (MPAM) is an + optional extension to the Arm architecture that allows each + transaction issued to the memory system to be labelled with a + Partition identifier (PARTID) and Performance Monitoring Group + identifier (PMG). + + Memory system components, such as the caches, can be configured with + policies to control how much of various physical resources (such as + memory bandwidth or cache memory) the transactions labelled with each + PARTID can consume. Depending on the capabilities of the hardware, + the PARTID and PMG can also be used as filtering criteria to measure + the memory system resource consumption of different parts of a + workload. + + Use of this extension requires CPU support, support in the + Memory System Components (MSC), and a description from firmware + of where the MSCs are in the address space. + + MPAM is exposed to user-space via the resctrl pseudo filesystem. + + This option enables the extra context switch code. + endmenu # "ARMv8.4 architectural features" menu "ARMv8.5 architectural features" diff --git a/arch/arm64/include/asm/el2_setup.h b/arch/arm64/include/asm/el2_setup.h index 9dbd0da8eee8c..42cebf1dac4a7 100644 --- a/arch/arm64/include/asm/el2_setup.h +++ b/arch/arm64/include/asm/el2_setup.h @@ -514,7 +514,8 @@ check_override id_aa64pfr0, ID_AA64PFR0_EL1_MPAM_SHIFT, .Linit_mpam_\@, .Lskip_mpam_\@, x1, x2 .Linit_mpam_\@: - msr_s SYS_MPAM2_EL2, xzr // use the default partition + mov x0, #MPAM2_EL2_EnMPAMSM_MASK + msr_s SYS_MPAM2_EL2, x0 // use the default partition, // and disable lower traps mrs_s x0, SYS_MPAMIDR_EL1 tbz x0, #MPAMIDR_EL1_HAS_HCR_SHIFT, .Lskip_mpam_\@ // skip if no MPAMHCR reg diff --git a/arch/arm64/include/asm/mpam.h b/arch/arm64/include/asm/mpam.h new file mode 100644 index 0000000000000..70d396e7b6da8 --- /dev/null +++ b/arch/arm64/include/asm/mpam.h @@ -0,0 +1,96 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2025 Arm Ltd. */ + +#ifndef __ASM__MPAM_H +#define __ASM__MPAM_H + +#include +#include +#include +#include +#include + +#include + +DECLARE_STATIC_KEY_FALSE(mpam_enabled); +DECLARE_PER_CPU(u64, arm64_mpam_default); +DECLARE_PER_CPU(u64, arm64_mpam_current); + +/* + * The value of the MPAM0_EL1 sysreg when a task is in resctrl's default group. + * This is used by the context switch code to use the resctrl CPU property + * instead. The value is modified when CDP is enabled/disabled by mounting + * the resctrl filesystem. + */ +extern u64 arm64_mpam_global_default; + +#ifdef CONFIG_ARM64_MPAM +static inline u64 __mpam_regval(u16 partid_d, u16 partid_i, u8 pmg_d, u8 pmg_i) +{ + return FIELD_PREP(MPAM0_EL1_PARTID_D, partid_d) | + FIELD_PREP(MPAM0_EL1_PARTID_I, partid_i) | + FIELD_PREP(MPAM0_EL1_PMG_D, pmg_d) | + FIELD_PREP(MPAM0_EL1_PMG_I, pmg_i); +} + +static inline void mpam_set_cpu_defaults(int cpu, u16 partid_d, u16 partid_i, + u8 pmg_d, u8 pmg_i) +{ + u64 default_val = __mpam_regval(partid_d, partid_i, pmg_d, pmg_i); + + WRITE_ONCE(per_cpu(arm64_mpam_default, cpu), default_val); +} + +/* + * The resctrl filesystem writes to the partid/pmg values for threads and CPUs, + * which may race with reads in mpam_thread_switch(). Ensure only one of the old + * or new values are used. Particular care should be taken with the pmg field as + * mpam_thread_switch() may read a partid and pmg that don't match, causing this + * value to be stored with cache allocations, despite being considered 'free' by + * resctrl. + */ +static inline u64 mpam_get_regval(struct task_struct *tsk) +{ + return READ_ONCE(task_thread_info(tsk)->mpam_partid_pmg); +} + +static inline void mpam_set_task_partid_pmg(struct task_struct *tsk, + u16 partid_d, u16 partid_i, + u8 pmg_d, u8 pmg_i) +{ + u64 regval = __mpam_regval(partid_d, partid_i, pmg_d, pmg_i); + + WRITE_ONCE(task_thread_info(tsk)->mpam_partid_pmg, regval); +} + +static inline void mpam_thread_switch(struct task_struct *tsk) +{ + u64 oldregval; + int cpu = smp_processor_id(); + u64 regval = mpam_get_regval(tsk); + + if (!static_branch_likely(&mpam_enabled)) + return; + + if (regval == READ_ONCE(arm64_mpam_global_default)) + regval = READ_ONCE(per_cpu(arm64_mpam_default, cpu)); + + oldregval = READ_ONCE(per_cpu(arm64_mpam_current, cpu)); + if (oldregval == regval) + return; + + write_sysreg_s(regval | MPAM1_EL1_MPAMEN, SYS_MPAM1_EL1); + if (system_supports_sme()) + write_sysreg_s(regval & (MPAMSM_EL1_PARTID_D | MPAMSM_EL1_PMG_D), SYS_MPAMSM_EL1); + isb(); + + /* Synchronising the EL0 write is left until the ERET to EL0 */ + write_sysreg_s(regval, SYS_MPAM0_EL1); + + WRITE_ONCE(per_cpu(arm64_mpam_current, cpu), regval); +} +#else +static inline void mpam_thread_switch(struct task_struct *tsk) {} +#endif /* CONFIG_ARM64_MPAM */ + +#endif /* __ASM__MPAM_H */ diff --git a/arch/arm64/include/asm/resctrl.h b/arch/arm64/include/asm/resctrl.h new file mode 100644 index 0000000000000..b506e95cf6e37 --- /dev/null +++ b/arch/arm64/include/asm/resctrl.h @@ -0,0 +1,2 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h index f241b8601ebd9..c226dabd50191 100644 --- a/arch/arm64/include/asm/thread_info.h +++ b/arch/arm64/include/asm/thread_info.h @@ -41,6 +41,9 @@ struct thread_info { #ifdef CONFIG_SHADOW_CALL_STACK void *scs_base; void *scs_sp; +#endif +#ifdef CONFIG_ARM64_MPAM + u64 mpam_partid_pmg; #endif u32 cpu; }; diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile index 76f32e424065e..15979f3665196 100644 --- a/arch/arm64/kernel/Makefile +++ b/arch/arm64/kernel/Makefile @@ -67,6 +67,7 @@ obj-$(CONFIG_CRASH_DUMP) += crash_dump.o obj-$(CONFIG_VMCORE_INFO) += vmcore_info.o obj-$(CONFIG_ARM_SDE_INTERFACE) += sdei.o obj-$(CONFIG_ARM64_PTR_AUTH) += pointer_auth.o +obj-$(CONFIG_ARM64_MPAM) += mpam.o obj-$(CONFIG_ARM64_MTE) += mte.o obj-y += vdso-wrap.o obj-$(CONFIG_COMPAT_VDSO) += vdso32-wrap.o diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 5b26739af16b8..e42c66d60aa5b 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -86,6 +86,7 @@ #include #include #include +#include #include #include #include @@ -2452,13 +2453,19 @@ test_has_mpam(const struct arm64_cpu_capabilities *entry, int scope) static void cpu_enable_mpam(const struct arm64_cpu_capabilities *entry) { - /* - * Access by the kernel (at EL1) should use the reserved PARTID - * which is configured unrestricted. This avoids priority-inversion - * where latency sensitive tasks have to wait for a task that has - * been throttled to release the lock. - */ - write_sysreg_s(0, SYS_MPAM1_EL1); + int cpu = smp_processor_id(); + u64 regval = 0; + + if (IS_ENABLED(CONFIG_ARM64_MPAM) && static_branch_likely(&mpam_enabled)) + regval = READ_ONCE(per_cpu(arm64_mpam_current, cpu)); + + write_sysreg_s(regval | MPAM1_EL1_MPAMEN, SYS_MPAM1_EL1); + if (cpus_have_cap(ARM64_SME)) + write_sysreg_s(regval & (MPAMSM_EL1_PARTID_D | MPAMSM_EL1_PMG_D), SYS_MPAMSM_EL1); + isb(); + + /* Synchronising the EL0 write is left until the ERET to EL0 */ + write_sysreg_s(regval, SYS_MPAM0_EL1); } static bool diff --git a/arch/arm64/kernel/mpam.c b/arch/arm64/kernel/mpam.c new file mode 100644 index 0000000000000..3a490de4fa125 --- /dev/null +++ b/arch/arm64/kernel/mpam.c @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2025 Arm Ltd. */ + +#include + +#include +#include +#include +#include + +DEFINE_STATIC_KEY_FALSE(mpam_enabled); +DEFINE_PER_CPU(u64, arm64_mpam_default); +DEFINE_PER_CPU(u64, arm64_mpam_current); + +u64 arm64_mpam_global_default; + +static int mpam_pm_notifier(struct notifier_block *self, + unsigned long cmd, void *v) +{ + u64 regval; + int cpu = smp_processor_id(); + + switch (cmd) { + case CPU_PM_EXIT: + /* + * Don't use mpam_thread_switch() as the system register + * value has changed under our feet. + */ + regval = READ_ONCE(per_cpu(arm64_mpam_current, cpu)); + write_sysreg_s(regval | MPAM1_EL1_MPAMEN, SYS_MPAM1_EL1); + if (system_supports_sme()) { + write_sysreg_s(regval & (MPAMSM_EL1_PARTID_D | MPAMSM_EL1_PMG_D), + SYS_MPAMSM_EL1); + } + isb(); + + write_sysreg_s(regval, SYS_MPAM0_EL1); + + return NOTIFY_OK; + default: + return NOTIFY_DONE; + } +} + +static struct notifier_block mpam_pm_nb = { + .notifier_call = mpam_pm_notifier, +}; + +static int __init arm64_mpam_register_cpus(void) +{ + u64 mpamidr = read_sanitised_ftr_reg(SYS_MPAMIDR_EL1); + u16 partid_max = FIELD_GET(MPAMIDR_EL1_PARTID_MAX, mpamidr); + u8 pmg_max = FIELD_GET(MPAMIDR_EL1_PMG_MAX, mpamidr); + + if (!system_supports_mpam()) + return 0; + + cpu_pm_register_notifier(&mpam_pm_nb); + return mpam_register_requestor(partid_max, pmg_max); +} +/* Must occur before mpam_msc_driver_init() from subsys_initcall() */ +arch_initcall(arm64_mpam_register_cpus) diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index 4c328b7c79ba3..d9994a46fe79f 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -52,6 +52,7 @@ #include #include #include +#include #include #include #include @@ -774,6 +775,12 @@ struct task_struct *__switch_to(struct task_struct *prev, if (prev->thread.sctlr_user != next->thread.sctlr_user) update_sctlr_el1(next->thread.sctlr_user); + /* + * MPAM thread switch happens after the DSB to ensure prev's accesses + * use prev's MPAM settings. + */ + mpam_thread_switch(next); + /* the actual thread switch */ last = cpu_switch_to(prev, next); diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h index c5d5e5b86eaf0..63195275a8b82 100644 --- a/arch/arm64/kvm/hyp/include/hyp/switch.h +++ b/arch/arm64/kvm/hyp/include/hyp/switch.h @@ -269,7 +269,8 @@ static inline void __deactivate_traps_hfgxtr(struct kvm_vcpu *vcpu) static inline void __activate_traps_mpam(struct kvm_vcpu *vcpu) { - u64 r = MPAM2_EL2_TRAPMPAM0EL1 | MPAM2_EL2_TRAPMPAM1EL1; + u64 clr = MPAM2_EL2_EnMPAMSM; + u64 set = MPAM2_EL2_TRAPMPAM0EL1 | MPAM2_EL2_TRAPMPAM1EL1; if (!system_supports_mpam()) return; @@ -279,18 +280,21 @@ static inline void __activate_traps_mpam(struct kvm_vcpu *vcpu) write_sysreg_s(MPAMHCR_EL2_TRAP_MPAMIDR_EL1, SYS_MPAMHCR_EL2); } else { /* From v1.1 TIDR can trap MPAMIDR, set it unconditionally */ - r |= MPAM2_EL2_TIDR; + set |= MPAM2_EL2_TIDR; } - write_sysreg_s(r, SYS_MPAM2_EL2); + sysreg_clear_set_s(SYS_MPAM2_EL2, clr, set); } static inline void __deactivate_traps_mpam(void) { + u64 clr = MPAM2_EL2_TRAPMPAM0EL1 | MPAM2_EL2_TRAPMPAM1EL1 | MPAM2_EL2_TIDR; + u64 set = MPAM2_EL2_EnMPAMSM; + if (!system_supports_mpam()) return; - write_sysreg_s(0, SYS_MPAM2_EL2); + sysreg_clear_set_s(SYS_MPAM2_EL2, clr, set); if (system_supports_mpam_hcr()) write_sysreg_s(MPAMHCR_HOST_FLAGS, SYS_MPAMHCR_EL2); diff --git a/arch/arm64/kvm/hyp/vhe/sysreg-sr.c b/arch/arm64/kvm/hyp/vhe/sysreg-sr.c index f28c6cf4fe1be..7ba3240e9c3ff 100644 --- a/arch/arm64/kvm/hyp/vhe/sysreg-sr.c +++ b/arch/arm64/kvm/hyp/vhe/sysreg-sr.c @@ -183,6 +183,21 @@ void sysreg_restore_guest_state_vhe(struct kvm_cpu_context *ctxt) } NOKPROBE_SYMBOL(sysreg_restore_guest_state_vhe); +/* + * The _EL0 value was written by the host's context switch and belongs to the + * VMM. Copy this into the guest's _EL1 register. + */ +static inline void __mpam_guest_load(void) +{ + u64 mask = MPAM0_EL1_PARTID_D | MPAM0_EL1_PARTID_I | MPAM0_EL1_PMG_D | MPAM0_EL1_PMG_I; + + if (system_supports_mpam()) { + u64 val = (read_sysreg_s(SYS_MPAM0_EL1) & mask) | MPAM1_EL1_MPAMEN; + + write_sysreg_el1(val, SYS_MPAM1); + } +} + /** * __vcpu_load_switch_sysregs - Load guest system registers to the physical CPU * @@ -222,6 +237,7 @@ void __vcpu_load_switch_sysregs(struct kvm_vcpu *vcpu) */ __sysreg32_restore_state(vcpu); __sysreg_restore_user_state(guest_ctxt); + __mpam_guest_load(); if (unlikely(is_hyp_ctxt(vcpu))) { __sysreg_restore_vel2_state(vcpu); diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 8d7309646ea64..9b63356eb12aa 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -3367,6 +3367,8 @@ static const struct sys_reg_desc sys_reg_descs[] = { { SYS_DESC(SYS_MPAM1_EL1), undef_access }, { SYS_DESC(SYS_MPAM0_EL1), undef_access }, + { SYS_DESC(SYS_MPAMSM_EL1), undef_access }, + { SYS_DESC(SYS_VBAR_EL1), access_rw, reset_val, VBAR_EL1, 0 }, { SYS_DESC(SYS_DISR_EL1), NULL, reset_val, DISR_EL1, 0 }, diff --git a/arch/arm64/tools/sysreg b/arch/arm64/tools/sysreg index 1c6cdf9d54bba..43c2759eb640b 100644 --- a/arch/arm64/tools/sysreg +++ b/arch/arm64/tools/sysreg @@ -5031,6 +5031,14 @@ Field 31:16 PARTID_D Field 15:0 PARTID_I EndSysreg +Sysreg MPAMSM_EL1 3 0 10 5 3 +Res0 63:48 +Field 47:40 PMG_D +Res0 39:32 +Field 31:16 PARTID_D +Res0 15:0 +EndSysreg + Sysreg ISR_EL1 3 0 12 1 0 Res0 63:11 Field 10 IS diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 8623c4a5ce7b5..97a660b6a65d4 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -538,6 +538,19 @@ config X86_CPU_RESCTRL Say N if unsure. +config X86_CPU_RESCTRL_INTEL_AET + bool "Intel Application Energy Telemetry" + depends on X86_64 && X86_CPU_RESCTRL && CPU_SUP_INTEL && INTEL_PMT_TELEMETRY=y && INTEL_TPMI=y + help + Enable per-RMID telemetry events in resctrl. + + Intel feature that collects per-RMID execution data + about energy consumption, measure of frequency independent + activity and other performance metrics. Data is aggregated + per package. + + Say N if unsure. + config X86_FRED bool "Flexible Return and Event Delivery" depends on X86_64 diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 4091a776e37aa..2abee3e9dc753 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -500,6 +500,8 @@ #define X86_FEATURE_ABMC (21*32+15) /* Assignable Bandwidth Monitoring Counters */ #define X86_FEATURE_MSR_IMM (21*32+16) /* MSR immediate form instructions */ +#define X86_FEATURE_SDCIAE (21*32+18) /* L3 Smart Data Cache Injection Allocation Enforcement */ + /* * BUG word(s) */ diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c index 46efcbd6afa41..87e78586395b1 100644 --- a/arch/x86/kernel/cpu/cpuid-deps.c +++ b/arch/x86/kernel/cpu/cpuid-deps.c @@ -72,6 +72,7 @@ static const struct cpuid_dep cpuid_deps[] = { { X86_FEATURE_CQM_MBM_LOCAL, X86_FEATURE_CQM_LLC }, { X86_FEATURE_BMEC, X86_FEATURE_CQM_MBM_TOTAL }, { X86_FEATURE_BMEC, X86_FEATURE_CQM_MBM_LOCAL }, + { X86_FEATURE_SDCIAE, X86_FEATURE_CAT_L3 }, { X86_FEATURE_AVX512_BF16, X86_FEATURE_AVX512VL }, { X86_FEATURE_AVX512_FP16, X86_FEATURE_AVX512BW }, { X86_FEATURE_ENQCMD, X86_FEATURE_XSAVES }, diff --git a/arch/x86/kernel/cpu/resctrl/Makefile b/arch/x86/kernel/cpu/resctrl/Makefile index d8a04b195da21..273ddfa308366 100644 --- a/arch/x86/kernel/cpu/resctrl/Makefile +++ b/arch/x86/kernel/cpu/resctrl/Makefile @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 obj-$(CONFIG_X86_CPU_RESCTRL) += core.o rdtgroup.o monitor.o obj-$(CONFIG_X86_CPU_RESCTRL) += ctrlmondata.o +obj-$(CONFIG_X86_CPU_RESCTRL_INTEL_AET) += intel_aet.o obj-$(CONFIG_RESCTRL_FS_PSEUDO_LOCK) += pseudo_lock.o # To allow define_trace.h's recursive include: diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index 79a1f2d5f5019..7667cf7c4e945 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -100,14 +100,33 @@ struct rdt_hw_resource rdt_resources_all[RDT_NUM_RESOURCES] = { .schema_fmt = RESCTRL_SCHEMA_RANGE, }, }, + [RDT_RESOURCE_PERF_PKG] = + { + .r_resctrl = { + .name = "PERF_PKG", + .mon_scope = RESCTRL_PACKAGE, + .mon_domains = mon_domain_init(RDT_RESOURCE_PERF_PKG), + }, + }, }; +/** + * resctrl_arch_system_num_rmid_idx - Compute number of supported RMIDs + * (minimum across all mon_capable resource) + * + * Return: Number of supported RMIDs at time of call. Note that mount time + * enumeration of resources may reduce the number. + */ u32 resctrl_arch_system_num_rmid_idx(void) { - struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; + u32 num_rmids = U32_MAX; + struct rdt_resource *r; + + for_each_mon_capable_rdt_resource(r) + num_rmids = min(num_rmids, r->mon.num_rmid); /* RMID are independent numbers for x86. num_rmid_idx == num_rmid */ - return r->mon.num_rmid; + return num_rmids == U32_MAX ? 0 : num_rmids; } struct rdt_resource *resctrl_arch_get_resource(enum resctrl_res_level l) @@ -274,6 +293,11 @@ static void rdt_get_cdp_config(int level) rdt_resources_all[level].r_resctrl.cdp_capable = true; } +static void rdt_set_io_alloc_capable(struct rdt_resource *r) +{ + r->cache.io_alloc_capable = true; +} + static void rdt_get_cdp_l3_config(void) { rdt_get_cdp_config(RDT_RESOURCE_L3); @@ -363,7 +387,7 @@ static void ctrl_domain_free(struct rdt_hw_ctrl_domain *hw_dom) kfree(hw_dom); } -static void mon_domain_free(struct rdt_hw_mon_domain *hw_dom) +static void l3_mon_domain_free(struct rdt_hw_l3_mon_domain *hw_dom) { int idx; @@ -396,11 +420,13 @@ static int domain_setup_ctrlval(struct rdt_resource *r, struct rdt_ctrl_domain * } /** - * arch_domain_mbm_alloc() - Allocate arch private storage for the MBM counters + * l3_mon_domain_mbm_alloc() - Allocate arch private storage for the MBM counters * @num_rmid: The size of the MBM counter array * @hw_dom: The domain that owns the allocated arrays + * + * Return: 0 for success, or -ENOMEM. */ -static int arch_domain_mbm_alloc(u32 num_rmid, struct rdt_hw_mon_domain *hw_dom) +static int l3_mon_domain_mbm_alloc(u32 num_rmid, struct rdt_hw_l3_mon_domain *hw_dom) { size_t tsize = sizeof(*hw_dom->arch_mbm_states[0]); enum resctrl_event_id eventid; @@ -433,6 +459,8 @@ static int get_domain_id_from_scope(int cpu, enum resctrl_scope scope) return get_cpu_cacheinfo_id(cpu, scope); case RESCTRL_L3_NODE: return cpu_to_node(cpu); + case RESCTRL_PACKAGE: + return topology_physical_package_id(cpu); default: break; } @@ -459,7 +487,7 @@ static void domain_add_cpu_ctrl(int cpu, struct rdt_resource *r) hdr = resctrl_find_domain(&r->ctrl_domains, id, &add_pos); if (hdr) { - if (WARN_ON_ONCE(hdr->type != RESCTRL_CTRL_DOMAIN)) + if (!domain_header_is_valid(hdr, RESCTRL_CTRL_DOMAIN, r->rid)) return; d = container_of(hdr, struct rdt_ctrl_domain, hdr); @@ -476,6 +504,7 @@ static void domain_add_cpu_ctrl(int cpu, struct rdt_resource *r) d = &hw_dom->d_resctrl; d->hdr.id = id; d->hdr.type = RESCTRL_CTRL_DOMAIN; + d->hdr.rid = r->rid; cpumask_set_cpu(cpu, &d->hdr.cpu_mask); rdt_domain_reconfigure_cdp(r); @@ -495,37 +524,13 @@ static void domain_add_cpu_ctrl(int cpu, struct rdt_resource *r) } } -static void domain_add_cpu_mon(int cpu, struct rdt_resource *r) +static void l3_mon_domain_setup(int cpu, int id, struct rdt_resource *r, struct list_head *add_pos) { - int id = get_domain_id_from_scope(cpu, r->mon_scope); - struct list_head *add_pos = NULL; - struct rdt_hw_mon_domain *hw_dom; - struct rdt_domain_hdr *hdr; - struct rdt_mon_domain *d; + struct rdt_hw_l3_mon_domain *hw_dom; + struct rdt_l3_mon_domain *d; struct cacheinfo *ci; int err; - lockdep_assert_held(&domain_list_lock); - - if (id < 0) { - pr_warn_once("Can't find monitor domain id for CPU:%d scope:%d for resource %s\n", - cpu, r->mon_scope, r->name); - return; - } - - hdr = resctrl_find_domain(&r->mon_domains, id, &add_pos); - if (hdr) { - if (WARN_ON_ONCE(hdr->type != RESCTRL_MON_DOMAIN)) - return; - d = container_of(hdr, struct rdt_mon_domain, hdr); - - cpumask_set_cpu(cpu, &d->hdr.cpu_mask); - /* Update the mbm_assign_mode state for the CPU if supported */ - if (r->mon.mbm_cntr_assignable) - resctrl_arch_mbm_cntr_assign_set_one(r); - return; - } - hw_dom = kzalloc_node(sizeof(*hw_dom), GFP_KERNEL, cpu_to_node(cpu)); if (!hw_dom) return; @@ -533,33 +538,66 @@ static void domain_add_cpu_mon(int cpu, struct rdt_resource *r) d = &hw_dom->d_resctrl; d->hdr.id = id; d->hdr.type = RESCTRL_MON_DOMAIN; + d->hdr.rid = RDT_RESOURCE_L3; ci = get_cpu_cacheinfo_level(cpu, RESCTRL_L3_CACHE); if (!ci) { pr_warn_once("Can't find L3 cache for CPU:%d resource %s\n", cpu, r->name); - mon_domain_free(hw_dom); + l3_mon_domain_free(hw_dom); return; } d->ci_id = ci->id; cpumask_set_cpu(cpu, &d->hdr.cpu_mask); - /* Update the mbm_assign_mode state for the CPU if supported */ - if (r->mon.mbm_cntr_assignable) - resctrl_arch_mbm_cntr_assign_set_one(r); - arch_mon_domain_online(r, d); - if (arch_domain_mbm_alloc(r->mon.num_rmid, hw_dom)) { - mon_domain_free(hw_dom); + if (l3_mon_domain_mbm_alloc(r->mon.num_rmid, hw_dom)) { + l3_mon_domain_free(hw_dom); return; } list_add_tail_rcu(&d->hdr.list, add_pos); - err = resctrl_online_mon_domain(r, d); + err = resctrl_online_mon_domain(r, &d->hdr); if (err) { list_del_rcu(&d->hdr.list); synchronize_rcu(); - mon_domain_free(hw_dom); + l3_mon_domain_free(hw_dom); + } +} + +static void domain_add_cpu_mon(int cpu, struct rdt_resource *r) +{ + int id = get_domain_id_from_scope(cpu, r->mon_scope); + struct list_head *add_pos = NULL; + struct rdt_domain_hdr *hdr; + + lockdep_assert_held(&domain_list_lock); + + if (id < 0) { + pr_warn_once("Can't find monitor domain id for CPU:%d scope:%d for resource %s\n", + cpu, r->mon_scope, r->name); + return; + } + + hdr = resctrl_find_domain(&r->mon_domains, id, &add_pos); + if (hdr) + cpumask_set_cpu(cpu, &hdr->cpu_mask); + + switch (r->rid) { + case RDT_RESOURCE_L3: + /* Update the mbm_assign_mode state for the CPU if supported */ + if (r->mon.mbm_cntr_assignable) + resctrl_arch_mbm_cntr_assign_set_one(r); + if (!hdr) + l3_mon_domain_setup(cpu, id, r, add_pos); + break; + case RDT_RESOURCE_PERF_PKG: + if (!hdr) + intel_aet_mon_domain_setup(cpu, id, r, add_pos); + break; + default: + pr_warn_once("Unknown resource rid=%d\n", r->rid); + break; } } @@ -593,36 +631,33 @@ static void domain_remove_cpu_ctrl(int cpu, struct rdt_resource *r) return; } - if (WARN_ON_ONCE(hdr->type != RESCTRL_CTRL_DOMAIN)) + cpumask_clear_cpu(cpu, &hdr->cpu_mask); + if (!cpumask_empty(&hdr->cpu_mask)) + return; + + if (!domain_header_is_valid(hdr, RESCTRL_CTRL_DOMAIN, r->rid)) return; d = container_of(hdr, struct rdt_ctrl_domain, hdr); hw_dom = resctrl_to_arch_ctrl_dom(d); - cpumask_clear_cpu(cpu, &d->hdr.cpu_mask); - if (cpumask_empty(&d->hdr.cpu_mask)) { - resctrl_offline_ctrl_domain(r, d); - list_del_rcu(&d->hdr.list); - synchronize_rcu(); - - /* - * rdt_ctrl_domain "d" is going to be freed below, so clear - * its pointer from pseudo_lock_region struct. - */ - if (d->plr) - d->plr->d = NULL; - ctrl_domain_free(hw_dom); + resctrl_offline_ctrl_domain(r, d); + list_del_rcu(&hdr->list); + synchronize_rcu(); - return; - } + /* + * rdt_ctrl_domain "d" is going to be freed below, so clear + * its pointer from pseudo_lock_region struct. + */ + if (d->plr) + d->plr->d = NULL; + ctrl_domain_free(hw_dom); } static void domain_remove_cpu_mon(int cpu, struct rdt_resource *r) { int id = get_domain_id_from_scope(cpu, r->mon_scope); - struct rdt_hw_mon_domain *hw_dom; struct rdt_domain_hdr *hdr; - struct rdt_mon_domain *d; lockdep_assert_held(&domain_list_lock); @@ -639,20 +674,42 @@ static void domain_remove_cpu_mon(int cpu, struct rdt_resource *r) return; } - if (WARN_ON_ONCE(hdr->type != RESCTRL_MON_DOMAIN)) + cpumask_clear_cpu(cpu, &hdr->cpu_mask); + if (!cpumask_empty(&hdr->cpu_mask)) return; - d = container_of(hdr, struct rdt_mon_domain, hdr); - hw_dom = resctrl_to_arch_mon_dom(d); + switch (r->rid) { + case RDT_RESOURCE_L3: { + struct rdt_hw_l3_mon_domain *hw_dom; + struct rdt_l3_mon_domain *d; - cpumask_clear_cpu(cpu, &d->hdr.cpu_mask); - if (cpumask_empty(&d->hdr.cpu_mask)) { - resctrl_offline_mon_domain(r, d); - list_del_rcu(&d->hdr.list); + if (!domain_header_is_valid(hdr, RESCTRL_MON_DOMAIN, RDT_RESOURCE_L3)) + return; + + d = container_of(hdr, struct rdt_l3_mon_domain, hdr); + hw_dom = resctrl_to_arch_mon_dom(d); + resctrl_offline_mon_domain(r, hdr); + list_del_rcu(&hdr->list); synchronize_rcu(); - mon_domain_free(hw_dom); + l3_mon_domain_free(hw_dom); + break; + } + case RDT_RESOURCE_PERF_PKG: { + struct rdt_perf_pkg_mon_domain *pkgd; - return; + if (!domain_header_is_valid(hdr, RESCTRL_MON_DOMAIN, RDT_RESOURCE_PERF_PKG)) + return; + + pkgd = container_of(hdr, struct rdt_perf_pkg_mon_domain, hdr); + resctrl_offline_mon_domain(r, hdr); + list_del_rcu(&hdr->list); + synchronize_rcu(); + kfree(pkgd); + break; + } + default: + pr_warn_once("Unknown resource rid=%d\n", r->rid); + break; } } @@ -707,6 +764,28 @@ static int resctrl_arch_offline_cpu(unsigned int cpu) return 0; } +void resctrl_arch_pre_mount(void) +{ + struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_PERF_PKG].r_resctrl; + int cpu; + + if (!intel_aet_get_events()) + return; + + /* + * Late discovery of telemetry events means the domains for the + * resource were not built. Do that now. + */ + cpus_read_lock(); + mutex_lock(&domain_list_lock); + r->mon_capable = true; + rdt_mon_capable = true; + for_each_online_cpu(cpu) + domain_add_cpu_mon(cpu, r); + mutex_unlock(&domain_list_lock); + cpus_read_unlock(); +} + enum { RDT_FLAG_CMT, RDT_FLAG_MBM_TOTAL, @@ -719,6 +798,7 @@ enum { RDT_FLAG_SMBA, RDT_FLAG_BMEC, RDT_FLAG_ABMC, + RDT_FLAG_SDCIAE, }; #define RDT_OPT(idx, n, f) \ @@ -745,6 +825,7 @@ static struct rdt_options rdt_options[] __ro_after_init = { RDT_OPT(RDT_FLAG_SMBA, "smba", X86_FEATURE_SMBA), RDT_OPT(RDT_FLAG_BMEC, "bmec", X86_FEATURE_BMEC), RDT_OPT(RDT_FLAG_ABMC, "abmc", X86_FEATURE_ABMC), + RDT_OPT(RDT_FLAG_SDCIAE, "sdciae", X86_FEATURE_SDCIAE), }; #define NUM_RDT_OPTIONS ARRAY_SIZE(rdt_options) @@ -760,6 +841,8 @@ static int __init set_rdt_options(char *str) force_off = *tok == '!'; if (force_off) tok++; + if (intel_handle_aet_option(force_off, tok)) + continue; for (o = rdt_options; o < &rdt_options[NUM_RDT_OPTIONS]; o++) { if (strcmp(tok, o->name) == 0) { if (force_off) @@ -854,6 +937,8 @@ static __init bool get_rdt_alloc_resources(void) rdt_get_cache_alloc_cfg(1, r); if (rdt_cpu_has(X86_FEATURE_CDP_L3)) rdt_get_cdp_l3_config(); + if (rdt_cpu_has(X86_FEATURE_SDCIAE)) + rdt_set_io_alloc_capable(r); ret = true; } if (rdt_cpu_has(X86_FEATURE_CAT_L2)) { @@ -880,15 +965,15 @@ static __init bool get_rdt_mon_resources(void) bool ret = false; if (rdt_cpu_has(X86_FEATURE_CQM_OCCUP_LLC)) { - resctrl_enable_mon_event(QOS_L3_OCCUP_EVENT_ID); + resctrl_enable_mon_event(QOS_L3_OCCUP_EVENT_ID, false, 0, NULL); ret = true; } if (rdt_cpu_has(X86_FEATURE_CQM_MBM_TOTAL)) { - resctrl_enable_mon_event(QOS_L3_MBM_TOTAL_EVENT_ID); + resctrl_enable_mon_event(QOS_L3_MBM_TOTAL_EVENT_ID, false, 0, NULL); ret = true; } if (rdt_cpu_has(X86_FEATURE_CQM_MBM_LOCAL)) { - resctrl_enable_mon_event(QOS_L3_MBM_LOCAL_EVENT_ID); + resctrl_enable_mon_event(QOS_L3_MBM_LOCAL_EVENT_ID, false, 0, NULL); ret = true; } if (rdt_cpu_has(X86_FEATURE_ABMC)) @@ -897,7 +982,7 @@ static __init bool get_rdt_mon_resources(void) if (!ret) return false; - return !rdt_get_mon_l3_config(r); + return !rdt_get_l3_mon_config(r); } static __init void __check_quirks_intel(void) @@ -1075,6 +1160,8 @@ late_initcall(resctrl_arch_late_init); static void __exit resctrl_arch_exit(void) { + intel_aet_exit(); + cpuhp_remove_state(rdt_online); resctrl_exit(); diff --git a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c index 1189c0df4ad76..b20e705606b8f 100644 --- a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c +++ b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c @@ -91,3 +91,43 @@ u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_ctrl_domain *d, return hw_dom->ctrl_val[idx]; } + +bool resctrl_arch_get_io_alloc_enabled(struct rdt_resource *r) +{ + return resctrl_to_arch_res(r)->sdciae_enabled; +} + +static void resctrl_sdciae_set_one_amd(void *arg) +{ + bool *enable = arg; + + if (*enable) + msr_set_bit(MSR_IA32_L3_QOS_EXT_CFG, SDCIAE_ENABLE_BIT); + else + msr_clear_bit(MSR_IA32_L3_QOS_EXT_CFG, SDCIAE_ENABLE_BIT); +} + +static void _resctrl_sdciae_enable(struct rdt_resource *r, bool enable) +{ + struct rdt_ctrl_domain *d; + + /* Walking r->ctrl_domains, ensure it can't race with cpuhp */ + lockdep_assert_cpus_held(); + + /* Update MSR_IA32_L3_QOS_EXT_CFG MSR on all the CPUs in all domains */ + list_for_each_entry(d, &r->ctrl_domains, hdr.list) + on_each_cpu_mask(&d->hdr.cpu_mask, resctrl_sdciae_set_one_amd, &enable, 1); +} + +int resctrl_arch_io_alloc_enable(struct rdt_resource *r, bool enable) +{ + struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); + + if (hw_res->r_resctrl.cache.io_alloc_capable && + hw_res->sdciae_enabled != enable) { + _resctrl_sdciae_enable(r, enable); + hw_res->sdciae_enabled = enable; + } + + return 0; +} diff --git a/arch/x86/kernel/cpu/resctrl/intel_aet.c b/arch/x86/kernel/cpu/resctrl/intel_aet.c new file mode 100644 index 0000000000000..89b8b619d5d53 --- /dev/null +++ b/arch/x86/kernel/cpu/resctrl/intel_aet.c @@ -0,0 +1,409 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Resource Director Technology(RDT) + * - Intel Application Energy Telemetry + * + * Copyright (C) 2025 Intel Corporation + * + * Author: + * Tony Luck + */ + +#define pr_fmt(fmt) "resctrl: " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "internal.h" + +/** + * struct pmt_event - Telemetry event. + * @id: Resctrl event id. + * @idx: Counter index within each per-RMID block of counters. + * @bin_bits: Zero for integer valued events, else number bits in fraction + * part of fixed-point. + */ +struct pmt_event { + enum resctrl_event_id id; + unsigned int idx; + unsigned int bin_bits; +}; + +#define EVT(_id, _idx, _bits) { .id = _id, .idx = _idx, .bin_bits = _bits } + +/** + * struct event_group - Events with the same feature type ("energy" or "perf") and GUID. + * @pfname: PMT feature name ("energy" or "perf") of this event group. + * Used by boot rdt= option. + * @pfg: Points to the aggregated telemetry space information + * returned by the intel_pmt_get_regions_by_feature() + * call to the INTEL_PMT_TELEMETRY driver that contains + * data for all telemetry regions of type @pfname. + * Valid if the system supports the event group, + * NULL otherwise. + * @force_off: True when "rdt" command line or architecture code disables + * this event group due to insufficient RMIDs. + * @force_on: True when "rdt" command line overrides disable of this + * event group. + * @guid: Unique number per XML description file. + * @num_rmid: Number of RMIDs supported by this group. May be + * adjusted downwards if enumeration from + * intel_pmt_get_regions_by_feature() indicates fewer + * RMIDs can be tracked simultaneously. + * @mmio_size: Number of bytes of MMIO registers for this group. + * @num_events: Number of events in this group. + * @evts: Array of event descriptors. + */ +struct event_group { + /* Data fields for additional structures to manage this group. */ + const char *pfname; + struct pmt_feature_group *pfg; + bool force_off, force_on; + + /* Remaining fields initialized from XML file. */ + u32 guid; + u32 num_rmid; + size_t mmio_size; + unsigned int num_events; + struct pmt_event evts[] __counted_by(num_events); +}; + +#define XML_MMIO_SIZE(num_rmids, num_events, num_extra_status) \ + (((num_rmids) * (num_events) + (num_extra_status)) * sizeof(u64)) + +/* + * Link: https://github.com/intel/Intel-PMT/blob/main/xml/CWF/OOBMSM/RMID-ENERGY/cwf_aggregator.xml + */ +static struct event_group energy_0x26696143 = { + .pfname = "energy", + .guid = 0x26696143, + .num_rmid = 576, + .mmio_size = XML_MMIO_SIZE(576, 2, 3), + .num_events = 2, + .evts = { + EVT(PMT_EVENT_ENERGY, 0, 18), + EVT(PMT_EVENT_ACTIVITY, 1, 18), + } +}; + +/* + * Link: https://github.com/intel/Intel-PMT/blob/main/xml/CWF/OOBMSM/RMID-PERF/cwf_aggregator.xml + */ +static struct event_group perf_0x26557651 = { + .pfname = "perf", + .guid = 0x26557651, + .num_rmid = 576, + .mmio_size = XML_MMIO_SIZE(576, 7, 3), + .num_events = 7, + .evts = { + EVT(PMT_EVENT_STALLS_LLC_HIT, 0, 0), + EVT(PMT_EVENT_C1_RES, 1, 0), + EVT(PMT_EVENT_UNHALTED_CORE_CYCLES, 2, 0), + EVT(PMT_EVENT_STALLS_LLC_MISS, 3, 0), + EVT(PMT_EVENT_AUTO_C6_RES, 4, 0), + EVT(PMT_EVENT_UNHALTED_REF_CYCLES, 5, 0), + EVT(PMT_EVENT_UOPS_RETIRED, 6, 0), + } +}; + +static struct event_group *known_event_groups[] = { + &energy_0x26696143, + &perf_0x26557651, +}; + +#define for_each_event_group(_peg) \ + for (_peg = known_event_groups; \ + _peg < &known_event_groups[ARRAY_SIZE(known_event_groups)]; \ + _peg++) + +bool intel_handle_aet_option(bool force_off, char *tok) +{ + struct event_group **peg; + bool ret = false; + u32 guid = 0; + char *name; + + if (!tok) + return false; + + name = strsep(&tok, ":"); + if (tok && kstrtou32(tok, 16, &guid)) + return false; + + for_each_event_group(peg) { + if (strcmp(name, (*peg)->pfname)) + continue; + if (guid && (*peg)->guid != guid) + continue; + if (force_off) + (*peg)->force_off = true; + else + (*peg)->force_on = true; + ret = true; + } + + return ret; +} + +static bool skip_telem_region(struct telemetry_region *tr, struct event_group *e) +{ + if (tr->guid != e->guid) + return true; + if (tr->plat_info.package_id >= topology_max_packages()) { + pr_warn("Bad package %u in guid 0x%x\n", tr->plat_info.package_id, + tr->guid); + return true; + } + if (tr->size != e->mmio_size) { + pr_warn("MMIO space wrong size (%zu bytes) for guid 0x%x. Expected %zu bytes.\n", + tr->size, e->guid, e->mmio_size); + return true; + } + + return false; +} + +static bool group_has_usable_regions(struct event_group *e, struct pmt_feature_group *p) +{ + bool usable_regions = false; + + for (int i = 0; i < p->count; i++) { + if (skip_telem_region(&p->regions[i], e)) { + /* + * Clear the address field of regions that did not pass the checks in + * skip_telem_region() so they will not be used by intel_aet_read_event(). + * This is safe to do because intel_pmt_get_regions_by_feature() allocates + * a new pmt_feature_group structure to return to each caller and only makes + * use of the pmt_feature_group::kref field when intel_pmt_put_feature_group() + * returns the structure. + */ + p->regions[i].addr = NULL; + + continue; + } + usable_regions = true; + } + + return usable_regions; +} + +static bool all_regions_have_sufficient_rmid(struct event_group *e, struct pmt_feature_group *p) +{ + struct telemetry_region *tr; + + for (int i = 0; i < p->count; i++) { + if (!p->regions[i].addr) + continue; + tr = &p->regions[i]; + if (tr->num_rmids < e->num_rmid) { + e->force_off = true; + return false; + } + } + + return true; +} + +static bool enable_events(struct event_group *e, struct pmt_feature_group *p) +{ + struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_PERF_PKG].r_resctrl; + int skipped_events = 0; + + if (e->force_off) + return false; + + if (!group_has_usable_regions(e, p)) + return false; + + /* + * Only enable event group with insufficient RMIDs if the user requested + * it from the kernel command line. + */ + if (!all_regions_have_sufficient_rmid(e, p) && !e->force_on) { + pr_info("%s %s:0x%x monitoring not enabled due to insufficient RMIDs\n", + r->name, e->pfname, e->guid); + return false; + } + + for (int i = 0; i < p->count; i++) { + if (!p->regions[i].addr) + continue; + /* + * e->num_rmid only adjusted lower if user (via rdt= kernel + * parameter) forces an event group with insufficient RMID + * to be enabled. + */ + e->num_rmid = min(e->num_rmid, p->regions[i].num_rmids); + } + + for (int j = 0; j < e->num_events; j++) { + if (!resctrl_enable_mon_event(e->evts[j].id, true, + e->evts[j].bin_bits, &e->evts[j])) + skipped_events++; + } + if (e->num_events == skipped_events) { + pr_info("No events enabled in %s %s:0x%x\n", r->name, e->pfname, e->guid); + return false; + } + + if (r->mon.num_rmid) + r->mon.num_rmid = min(r->mon.num_rmid, e->num_rmid); + else + r->mon.num_rmid = e->num_rmid; + + if (skipped_events) + pr_info("%s %s:0x%x monitoring detected (skipped %d events)\n", r->name, + e->pfname, e->guid, skipped_events); + else + pr_info("%s %s:0x%x monitoring detected\n", r->name, e->pfname, e->guid); + + return true; +} + +static enum pmt_feature_id lookup_pfid(const char *pfname) +{ + if (!strcmp(pfname, "energy")) + return FEATURE_PER_RMID_ENERGY_TELEM; + else if (!strcmp(pfname, "perf")) + return FEATURE_PER_RMID_PERF_TELEM; + + pr_warn("Unknown PMT feature name '%s'\n", pfname); + + return FEATURE_INVALID; +} + +/* + * Request a copy of struct pmt_feature_group for each event group. If there is + * one, the returned structure has an array of telemetry_region structures, + * each element of the array describes one telemetry aggregator. The + * telemetry aggregators may have different GUIDs so obtain duplicate struct + * pmt_feature_group for event groups with same feature type but different + * GUID. Post-processing ensures an event group can only use the telemetry + * aggregators that match its GUID. An event group keeps a pointer to its + * struct pmt_feature_group to indicate that its events are successfully + * enabled. + */ +bool intel_aet_get_events(void) +{ + struct pmt_feature_group *p; + enum pmt_feature_id pfid; + struct event_group **peg; + bool ret = false; + + for_each_event_group(peg) { + pfid = lookup_pfid((*peg)->pfname); + p = intel_pmt_get_regions_by_feature(pfid); + if (IS_ERR_OR_NULL(p)) + continue; + if (enable_events(*peg, p)) { + (*peg)->pfg = p; + ret = true; + } else { + intel_pmt_put_feature_group(p); + } + } + + return ret; +} + +void __exit intel_aet_exit(void) +{ + struct event_group **peg; + + for_each_event_group(peg) { + if ((*peg)->pfg) { + intel_pmt_put_feature_group((*peg)->pfg); + (*peg)->pfg = NULL; + } + } +} + +#define DATA_VALID BIT_ULL(63) +#define DATA_BITS GENMASK_ULL(62, 0) + +/* + * Read counter for an event on a domain (summing all aggregators on the + * domain). If an aggregator hasn't received any data for a specific RMID, + * the MMIO read indicates that data is not valid. Return success if at + * least one aggregator has valid data. + */ +int intel_aet_read_event(int domid, u32 rmid, void *arch_priv, u64 *val) +{ + struct pmt_event *pevt = arch_priv; + struct event_group *e; + bool valid = false; + u64 total = 0; + u64 evtcount; + void *pevt0; + u32 idx; + + pevt0 = pevt - pevt->idx; + e = container_of(pevt0, struct event_group, evts); + idx = rmid * e->num_events; + idx += pevt->idx; + + if (idx * sizeof(u64) + sizeof(u64) > e->mmio_size) { + pr_warn_once("MMIO index %u out of range\n", idx); + return -EIO; + } + + for (int i = 0; i < e->pfg->count; i++) { + if (!e->pfg->regions[i].addr) + continue; + if (e->pfg->regions[i].plat_info.package_id != domid) + continue; + evtcount = readq(e->pfg->regions[i].addr + idx * sizeof(u64)); + if (!(evtcount & DATA_VALID)) + continue; + total += evtcount & DATA_BITS; + valid = true; + } + + if (valid) + *val = total; + + return valid ? 0 : -EINVAL; +} + +void intel_aet_mon_domain_setup(int cpu, int id, struct rdt_resource *r, + struct list_head *add_pos) +{ + struct rdt_perf_pkg_mon_domain *d; + int err; + + d = kzalloc_node(sizeof(*d), GFP_KERNEL, cpu_to_node(cpu)); + if (!d) + return; + + d->hdr.id = id; + d->hdr.type = RESCTRL_MON_DOMAIN; + d->hdr.rid = RDT_RESOURCE_PERF_PKG; + cpumask_set_cpu(cpu, &d->hdr.cpu_mask); + list_add_tail_rcu(&d->hdr.list, add_pos); + + err = resctrl_online_mon_domain(r, &d->hdr); + if (err) { + list_del_rcu(&d->hdr.list); + synchronize_rcu(); + kfree(d); + } +} diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index 6da9bd1a188b9..e3cfa0c10e92d 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -49,6 +49,9 @@ struct arch_mbm_state { #define ABMC_EXTENDED_EVT_ID BIT(31) #define ABMC_EVT_ID BIT(0) +/* Setting bit 1 in MSR_IA32_L3_QOS_EXT_CFG enables the SDCIAE feature. */ +#define SDCIAE_ENABLE_BIT 1 + /** * struct rdt_hw_ctrl_domain - Arch private attributes of a set of CPUs that share * a resource for a control function @@ -63,17 +66,17 @@ struct rdt_hw_ctrl_domain { }; /** - * struct rdt_hw_mon_domain - Arch private attributes of a set of CPUs that share - * a resource for a monitor function - * @d_resctrl: Properties exposed to the resctrl file system + * struct rdt_hw_l3_mon_domain - Arch private attributes of a set of CPUs sharing + * RDT_RESOURCE_L3 monitoring + * @d_resctrl: Properties exposed to the resctrl file system * @arch_mbm_states: Per-event pointer to the MBM event's saved state. * An MBM event's state is an array of struct arch_mbm_state * indexed by RMID on x86. * * Members of this structure are accessed via helpers that provide abstraction. */ -struct rdt_hw_mon_domain { - struct rdt_mon_domain d_resctrl; +struct rdt_hw_l3_mon_domain { + struct rdt_l3_mon_domain d_resctrl; struct arch_mbm_state *arch_mbm_states[QOS_NUM_L3_MBM_EVENTS]; }; @@ -82,11 +85,19 @@ static inline struct rdt_hw_ctrl_domain *resctrl_to_arch_ctrl_dom(struct rdt_ctr return container_of(r, struct rdt_hw_ctrl_domain, d_resctrl); } -static inline struct rdt_hw_mon_domain *resctrl_to_arch_mon_dom(struct rdt_mon_domain *r) +static inline struct rdt_hw_l3_mon_domain *resctrl_to_arch_mon_dom(struct rdt_l3_mon_domain *r) { - return container_of(r, struct rdt_hw_mon_domain, d_resctrl); + return container_of(r, struct rdt_hw_l3_mon_domain, d_resctrl); } +/** + * struct rdt_perf_pkg_mon_domain - CPUs sharing an package scoped resctrl monitor resource + * @hdr: common header for different domain types + */ +struct rdt_perf_pkg_mon_domain { + struct rdt_domain_hdr hdr; +}; + /** * struct msr_param - set a range of MSRs from a domain * @res: The resource to use @@ -115,6 +126,7 @@ struct msr_param { * @mbm_width: Monitor width, to detect and correct for overflow. * @cdp_enabled: CDP state of this resource * @mbm_cntr_assign_enabled: ABMC feature is enabled + * @sdciae_enabled: SDCIAE feature (backing "io_alloc") is enabled. * * Members of this structure are either private to the architecture * e.g. mbm_width, or accessed via helpers that provide abstraction. e.g. @@ -129,6 +141,7 @@ struct rdt_hw_resource { unsigned int mbm_width; bool cdp_enabled; bool mbm_cntr_assign_enabled; + bool sdciae_enabled; }; static inline struct rdt_hw_resource *resctrl_to_arch_res(struct rdt_resource *r) @@ -138,7 +151,7 @@ static inline struct rdt_hw_resource *resctrl_to_arch_res(struct rdt_resource *r extern struct rdt_hw_resource rdt_resources_all[]; -void arch_mon_domain_online(struct rdt_resource *r, struct rdt_mon_domain *d); +void arch_mon_domain_online(struct rdt_resource *r, struct rdt_l3_mon_domain *d); /* CPUID.(EAX=10H, ECX=ResID=1).EAX */ union cpuid_0x10_1_eax { @@ -211,7 +224,7 @@ union l3_qos_abmc_cfg { void rdt_ctrl_update(void *arg); -int rdt_get_mon_l3_config(struct rdt_resource *r); +int rdt_get_l3_mon_config(struct rdt_resource *r); bool rdt_cpu_has(int flag); @@ -220,4 +233,24 @@ void __init intel_rdt_mbm_apply_quirk(void); void rdt_domain_reconfigure_cdp(struct rdt_resource *r); void resctrl_arch_mbm_cntr_assign_set_one(struct rdt_resource *r); +#ifdef CONFIG_X86_CPU_RESCTRL_INTEL_AET +bool intel_aet_get_events(void); +void __exit intel_aet_exit(void); +int intel_aet_read_event(int domid, u32 rmid, void *arch_priv, u64 *val); +void intel_aet_mon_domain_setup(int cpu, int id, struct rdt_resource *r, + struct list_head *add_pos); +bool intel_handle_aet_option(bool force_off, char *tok); +#else +static inline bool intel_aet_get_events(void) { return false; } +static inline void __exit intel_aet_exit(void) { } +static inline int intel_aet_read_event(int domid, u32 rmid, void *arch_priv, u64 *val) +{ + return -EINVAL; +} + +static inline void intel_aet_mon_domain_setup(int cpu, int id, struct rdt_resource *r, + struct list_head *add_pos) { } +static inline bool intel_handle_aet_option(bool force_off, char *tok) { return false; } +#endif + #endif /* _ASM_X86_RESCTRL_INTERNAL_H */ diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index fe1a2aa53c16a..c10e1ae5ee8a7 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -109,7 +109,7 @@ static inline u64 get_corrected_mbm_count(u32 rmid, unsigned long val) * * In RMID sharing mode there are fewer "logical RMID" values available * to accumulate data ("physical RMIDs" are divided evenly between SNC - * nodes that share an L3 cache). Linux creates an rdt_mon_domain for + * nodes that share an L3 cache). Linux creates an rdt_l3_mon_domain for * each SNC node. * * The value loaded into IA32_PQR_ASSOC is the "logical RMID". @@ -157,7 +157,7 @@ static int __rmid_read_phys(u32 prmid, enum resctrl_event_id eventid, u64 *val) return 0; } -static struct arch_mbm_state *get_arch_mbm_state(struct rdt_hw_mon_domain *hw_dom, +static struct arch_mbm_state *get_arch_mbm_state(struct rdt_hw_l3_mon_domain *hw_dom, u32 rmid, enum resctrl_event_id eventid) { @@ -171,11 +171,11 @@ static struct arch_mbm_state *get_arch_mbm_state(struct rdt_hw_mon_domain *hw_do return state ? &state[rmid] : NULL; } -void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_mon_domain *d, +void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_l3_mon_domain *d, u32 unused, u32 rmid, enum resctrl_event_id eventid) { - struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d); + struct rdt_hw_l3_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d); int cpu = cpumask_any(&d->hdr.cpu_mask); struct arch_mbm_state *am; u32 prmid; @@ -194,9 +194,9 @@ void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_mon_domain *d, * Assumes that hardware counters are also reset and thus that there is * no need to record initial non-zero counts. */ -void resctrl_arch_reset_rmid_all(struct rdt_resource *r, struct rdt_mon_domain *d) +void resctrl_arch_reset_rmid_all(struct rdt_resource *r, struct rdt_l3_mon_domain *d) { - struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d); + struct rdt_hw_l3_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d); enum resctrl_event_id eventid; int idx; @@ -217,10 +217,10 @@ static u64 mbm_overflow_count(u64 prev_msr, u64 cur_msr, unsigned int width) return chunks >> shift; } -static u64 get_corrected_val(struct rdt_resource *r, struct rdt_mon_domain *d, +static u64 get_corrected_val(struct rdt_resource *r, struct rdt_l3_mon_domain *d, u32 rmid, enum resctrl_event_id eventid, u64 msr_val) { - struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d); + struct rdt_hw_l3_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d); struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); struct arch_mbm_state *am; u64 chunks; @@ -238,19 +238,29 @@ static u64 get_corrected_val(struct rdt_resource *r, struct rdt_mon_domain *d, return chunks * hw_res->mon_scale; } -int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_mon_domain *d, +int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain_hdr *hdr, u32 unused, u32 rmid, enum resctrl_event_id eventid, - u64 *val, void *ignored) + void *arch_priv, u64 *val, void *ignored) { - struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d); - int cpu = cpumask_any(&d->hdr.cpu_mask); + struct rdt_hw_l3_mon_domain *hw_dom; + struct rdt_l3_mon_domain *d; struct arch_mbm_state *am; u64 msr_val; u32 prmid; + int cpu; int ret; resctrl_arch_rmid_read_context_check(); + if (r->rid == RDT_RESOURCE_PERF_PKG) + return intel_aet_read_event(hdr->id, rmid, arch_priv, val); + + if (!domain_header_is_valid(hdr, RESCTRL_MON_DOMAIN, RDT_RESOURCE_L3)) + return -EINVAL; + + d = container_of(hdr, struct rdt_l3_mon_domain, hdr); + hw_dom = resctrl_to_arch_mon_dom(d); + cpu = cpumask_any(&hdr->cpu_mask); prmid = logical_rmid_to_physical_rmid(cpu, rmid); ret = __rmid_read_phys(prmid, eventid, &msr_val); @@ -302,11 +312,11 @@ static int __cntr_id_read(u32 cntr_id, u64 *val) return 0; } -void resctrl_arch_reset_cntr(struct rdt_resource *r, struct rdt_mon_domain *d, +void resctrl_arch_reset_cntr(struct rdt_resource *r, struct rdt_l3_mon_domain *d, u32 unused, u32 rmid, int cntr_id, enum resctrl_event_id eventid) { - struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d); + struct rdt_hw_l3_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d); struct arch_mbm_state *am; am = get_arch_mbm_state(hw_dom, rmid, eventid); @@ -318,7 +328,7 @@ void resctrl_arch_reset_cntr(struct rdt_resource *r, struct rdt_mon_domain *d, } } -int resctrl_arch_cntr_read(struct rdt_resource *r, struct rdt_mon_domain *d, +int resctrl_arch_cntr_read(struct rdt_resource *r, struct rdt_l3_mon_domain *d, u32 unused, u32 rmid, int cntr_id, enum resctrl_event_id eventid, u64 *val) { @@ -348,7 +358,7 @@ int resctrl_arch_cntr_read(struct rdt_resource *r, struct rdt_mon_domain *d, * must adjust RMID counter numbers based on SNC node. See * logical_rmid_to_physical_rmid() for code that does this. */ -void arch_mon_domain_online(struct rdt_resource *r, struct rdt_mon_domain *d) +void arch_mon_domain_online(struct rdt_resource *r, struct rdt_l3_mon_domain *d) { if (snc_nodes_per_l3_cache > 1) msr_clear_bit(MSR_RMID_SNC_CONFIG, 0); @@ -417,7 +427,7 @@ static __init int snc_get_config(void) return ret; } -int __init rdt_get_mon_l3_config(struct rdt_resource *r) +int __init rdt_get_l3_mon_config(struct rdt_resource *r) { unsigned int mbm_offset = boot_cpu_data.x86_cache_mbm_width_offset; struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); @@ -509,7 +519,7 @@ static void resctrl_abmc_set_one_amd(void *arg) */ static void _resctrl_abmc_enable(struct rdt_resource *r, bool enable) { - struct rdt_mon_domain *d; + struct rdt_l3_mon_domain *d; lockdep_assert_cpus_held(); @@ -548,11 +558,11 @@ static void resctrl_abmc_config_one_amd(void *info) /* * Send an IPI to the domain to assign the counter to RMID, event pair. */ -void resctrl_arch_config_cntr(struct rdt_resource *r, struct rdt_mon_domain *d, +void resctrl_arch_config_cntr(struct rdt_resource *r, struct rdt_l3_mon_domain *d, enum resctrl_event_id evtid, u32 rmid, u32 closid, u32 cntr_id, bool assign) { - struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d); + struct rdt_hw_l3_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d); union l3_qos_abmc_cfg abmc_cfg = { 0 }; struct arch_mbm_state *am; diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index caa4dc885c214..d113863a8eab0 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -53,6 +53,7 @@ static const struct cpuid_bit cpuid_bits[] = { { X86_FEATURE_SMBA, CPUID_EBX, 2, 0x80000020, 0 }, { X86_FEATURE_BMEC, CPUID_EBX, 3, 0x80000020, 0 }, { X86_FEATURE_ABMC, CPUID_EBX, 5, 0x80000020, 0 }, + { X86_FEATURE_SDCIAE, CPUID_EBX, 6, 0x80000020, 0 }, { X86_FEATURE_TSA_SQ_NO, CPUID_ECX, 1, 0x80000021, 0 }, { X86_FEATURE_TSA_L1_NO, CPUID_ECX, 2, 0x80000021, 0 }, { X86_FEATURE_AMD_WORKLOAD_CLASS, CPUID_EAX, 22, 0x80000021, 0 }, diff --git a/block/blk-mq-dma.c b/block/blk-mq-dma.c index 449950029872a..a1b623744b2f3 100644 --- a/block/blk-mq-dma.c +++ b/block/blk-mq-dma.c @@ -85,7 +85,7 @@ static inline bool blk_can_dma_map_iova(struct request *req, static bool blk_dma_map_bus(struct blk_dma_iter *iter, struct phys_vec *vec) { - iter->addr = pci_p2pdma_bus_addr_map(&iter->p2pdma, vec->paddr); + iter->addr = pci_p2pdma_bus_addr_map(iter->p2pdma.mem, vec->paddr); iter->len = vec->len; return true; } diff --git a/drivers/Kconfig b/drivers/Kconfig index 4915a63866b01..3054b50a2f4cb 100644 --- a/drivers/Kconfig +++ b/drivers/Kconfig @@ -251,4 +251,6 @@ source "drivers/hte/Kconfig" source "drivers/cdx/Kconfig" +source "drivers/resctrl/Kconfig" + endmenu diff --git a/drivers/Makefile b/drivers/Makefile index 8e1ffa4358d5f..20eb17596b899 100644 --- a/drivers/Makefile +++ b/drivers/Makefile @@ -194,6 +194,7 @@ obj-$(CONFIG_HTE) += hte/ obj-$(CONFIG_DRM_ACCEL) += accel/ obj-$(CONFIG_CDX_BUS) += cdx/ obj-$(CONFIG_DPLL) += dpll/ +obj-y += resctrl/ obj-$(CONFIG_DIBS) += dibs/ obj-$(CONFIG_S390) += s390/ diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c index fc96a5e234f06..75bdd7de0e8ac 100644 --- a/drivers/acpi/apei/ghes.c +++ b/drivers/acpi/apei/ghes.c @@ -510,12 +510,6 @@ static bool ghes_do_memory_failure(u64 physical_addr, int flags) return false; pfn = PHYS_PFN(physical_addr); - if (!pfn_valid(pfn) && !arch_is_platform_page(physical_addr)) { - pr_warn_ratelimited(FW_WARN GHES_PFX - "Invalid address in generic error data: %#llx\n", - physical_addr); - return false; - } if (flags == MF_ACTION_REQUIRED && current->mm) { twcb = (void *)gen_pool_alloc(ghes_estatus_pool, sizeof(*twcb)); diff --git a/drivers/acpi/arm64/Kconfig b/drivers/acpi/arm64/Kconfig index b3ed6212244c1..f2fd79f22e7d8 100644 --- a/drivers/acpi/arm64/Kconfig +++ b/drivers/acpi/arm64/Kconfig @@ -21,3 +21,6 @@ config ACPI_AGDI config ACPI_APMT bool + +config ACPI_MPAM + bool diff --git a/drivers/acpi/arm64/Makefile b/drivers/acpi/arm64/Makefile index 05ecde9eaabe9..9390b57cb5648 100644 --- a/drivers/acpi/arm64/Makefile +++ b/drivers/acpi/arm64/Makefile @@ -4,6 +4,7 @@ obj-$(CONFIG_ACPI_APMT) += apmt.o obj-$(CONFIG_ACPI_FFH) += ffh.o obj-$(CONFIG_ACPI_GTDT) += gtdt.o obj-$(CONFIG_ACPI_IORT) += iort.o +obj-$(CONFIG_ACPI_MPAM) += mpam.o obj-$(CONFIG_ACPI_PROCESSOR_IDLE) += cpuidle.o obj-$(CONFIG_ARM_AMBA) += amba.o obj-y += dma.o init.o diff --git a/drivers/acpi/arm64/mpam.c b/drivers/acpi/arm64/mpam.c new file mode 100644 index 0000000000000..84963a20c3e78 --- /dev/null +++ b/drivers/acpi/arm64/mpam.c @@ -0,0 +1,411 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (C) 2025 Arm Ltd. + +/* Parse the MPAM ACPI table feeding the discovered nodes into the driver */ + +#define pr_fmt(fmt) "ACPI MPAM: " fmt + +#include +#include +#include +#include +#include +#include + +#include + +/* + * Flags for acpi_table_mpam_msc.*_interrupt_flags. + * See 2.1.1 Interrupt Flags, Table 5, of DEN0065B_MPAM_ACPI_3.0-bet. + */ +#define ACPI_MPAM_MSC_IRQ_MODE BIT(0) +#define ACPI_MPAM_MSC_IRQ_TYPE_MASK GENMASK(2, 1) +#define ACPI_MPAM_MSC_IRQ_TYPE_WIRED 0 +#define ACPI_MPAM_MSC_IRQ_AFFINITY_TYPE_MASK BIT(3) +#define ACPI_MPAM_MSC_IRQ_AFFINITY_TYPE_PROCESSOR 0 +#define ACPI_MPAM_MSC_IRQ_AFFINITY_TYPE_PROCESSOR_CONTAINER 1 +#define ACPI_MPAM_MSC_IRQ_AFFINITY_VALID BIT(4) + +/* + * Encodings for the MSC node body interface type field. + * See 2.1 MPAM MSC node, Table 4 of DEN0065B_MPAM_ACPI_3.0-bet. + */ +#define ACPI_MPAM_MSC_IFACE_MMIO 0x00 +#define ACPI_MPAM_MSC_IFACE_PCC 0x0a + +static bool _is_ppi_partition(u32 flags) +{ + u32 aff_type, is_ppi; + bool ret; + + is_ppi = FIELD_GET(ACPI_MPAM_MSC_IRQ_AFFINITY_VALID, flags); + if (!is_ppi) + return false; + + aff_type = FIELD_GET(ACPI_MPAM_MSC_IRQ_AFFINITY_TYPE_MASK, flags); + ret = (aff_type == ACPI_MPAM_MSC_IRQ_AFFINITY_TYPE_PROCESSOR_CONTAINER); + if (ret) + pr_err_once("Partitioned interrupts not supported\n"); + + return ret; +} + +static int acpi_mpam_register_irq(struct platform_device *pdev, + u32 intid, u32 flags) +{ + int irq; + u32 int_type; + int trigger; + + if (!intid) + return -EINVAL; + + if (_is_ppi_partition(flags)) + return -EINVAL; + + trigger = FIELD_GET(ACPI_MPAM_MSC_IRQ_MODE, flags); + int_type = FIELD_GET(ACPI_MPAM_MSC_IRQ_TYPE_MASK, flags); + if (int_type != ACPI_MPAM_MSC_IRQ_TYPE_WIRED) + return -EINVAL; + + irq = acpi_register_gsi(&pdev->dev, intid, trigger, ACPI_ACTIVE_HIGH); + if (irq < 0) + pr_err_once("Failed to register interrupt 0x%x with ACPI\n", intid); + + return irq; +} + +static void acpi_mpam_parse_irqs(struct platform_device *pdev, + struct acpi_mpam_msc_node *tbl_msc, + struct resource *res, int *res_idx) +{ + u32 flags, intid; + int irq; + + intid = tbl_msc->overflow_interrupt; + flags = tbl_msc->overflow_interrupt_flags; + irq = acpi_mpam_register_irq(pdev, intid, flags); + if (irq > 0) + res[(*res_idx)++] = DEFINE_RES_IRQ_NAMED(irq, "overflow"); + + intid = tbl_msc->error_interrupt; + flags = tbl_msc->error_interrupt_flags; + irq = acpi_mpam_register_irq(pdev, intid, flags); + if (irq > 0) + res[(*res_idx)++] = DEFINE_RES_IRQ_NAMED(irq, "error"); +} + +static int acpi_mpam_parse_resource(struct mpam_msc *msc, + struct acpi_mpam_resource_node *res) +{ + int level, nid; + u32 cache_id; + + switch (res->locator_type) { + case ACPI_MPAM_LOCATION_TYPE_PROCESSOR_CACHE: + cache_id = res->locator.cache_locator.cache_reference; + level = find_acpi_cache_level_from_id(cache_id); + if (level <= 0) { + pr_err_once("Bad level (%d) for cache with id %u\n", level, cache_id); + return -EINVAL; + } + return mpam_ris_create(msc, res->ris_index, MPAM_CLASS_CACHE, + level, cache_id); + case ACPI_MPAM_LOCATION_TYPE_MEMORY: + nid = pxm_to_node(res->locator.memory_locator.proximity_domain); + if (nid == NUMA_NO_NODE) { + pr_debug("Bad proximity domain %lld, using node 0 instead\n", + res->locator.memory_locator.proximity_domain); + nid = 0; + } + return mpam_ris_create(msc, res->ris_index, MPAM_CLASS_MEMORY, + MPAM_CLASS_ID_DEFAULT, nid); + default: + /* These get discovered later and are treated as unknown */ + return 0; + } +} + +int acpi_mpam_parse_resources(struct mpam_msc *msc, + struct acpi_mpam_msc_node *tbl_msc) +{ + int i, err; + char *ptr, *table_end; + struct acpi_mpam_resource_node *resource; + + table_end = (char *)tbl_msc + tbl_msc->length; + ptr = (char *)(tbl_msc + 1); + for (i = 0; i < tbl_msc->num_resource_nodes; i++) { + u64 max_deps, remaining_table; + + if (ptr + sizeof(*resource) > table_end) + return -EINVAL; + + resource = (struct acpi_mpam_resource_node *)ptr; + + remaining_table = table_end - ptr; + max_deps = remaining_table / sizeof(struct acpi_mpam_func_deps); + if (resource->num_functional_deps > max_deps) { + pr_debug("MSC has impossible number of functional dependencies\n"); + return -EINVAL; + } + + err = acpi_mpam_parse_resource(msc, resource); + if (err) + return err; + + ptr += sizeof(*resource); + ptr += resource->num_functional_deps * sizeof(struct acpi_mpam_func_deps); + } + + return 0; +} + +/* + * Creates the device power management link and returns true if the + * acpi id is valid and usable for cpu affinity. This is the case + * when the linked device is a processor or a processor container. + */ +static bool __init parse_msc_pm_link(struct acpi_mpam_msc_node *tbl_msc, + struct platform_device *pdev, + u32 *acpi_id) +{ + char hid[sizeof(tbl_msc->hardware_id_linked_device) + 1] = { 0 }; + bool acpi_id_valid = false; + struct acpi_device *buddy; + char uid[11]; + int len; + + memcpy(hid, &tbl_msc->hardware_id_linked_device, + sizeof(tbl_msc->hardware_id_linked_device)); + + if (!strcmp(hid, ACPI_PROCESSOR_CONTAINER_HID)) { + *acpi_id = tbl_msc->instance_id_linked_device; + acpi_id_valid = true; + } + + len = snprintf(uid, sizeof(uid), "%u", + tbl_msc->instance_id_linked_device); + if (len >= sizeof(uid)) { + pr_debug("Failed to convert uid of device for power management."); + return acpi_id_valid; + } + + buddy = acpi_dev_get_first_match_dev(hid, uid, -1); + if (buddy) { + device_link_add(&pdev->dev, &buddy->dev, DL_FLAG_STATELESS); + acpi_dev_put(buddy); + } + + return acpi_id_valid; +} + +static int decode_interface_type(struct acpi_mpam_msc_node *tbl_msc, + enum mpam_msc_iface *iface) +{ + switch (tbl_msc->interface_type) { + case ACPI_MPAM_MSC_IFACE_MMIO: + *iface = MPAM_IFACE_MMIO; + return 0; + case ACPI_MPAM_MSC_IFACE_PCC: + *iface = MPAM_IFACE_PCC; + return 0; + default: + return -EINVAL; + } +} + +static struct platform_device * __init acpi_mpam_parse_msc(struct acpi_mpam_msc_node *tbl_msc) +{ + struct platform_device *pdev __free(platform_device_put) = + platform_device_alloc("mpam_msc", tbl_msc->identifier); + int next_res = 0, next_prop = 0, err; + /* pcc, nrdy, affinity and a sentinel */ + struct property_entry props[4] = { 0 }; + /* mmio, 2xirq, no sentinel. */ + struct resource res[3] = { 0 }; + struct acpi_device *companion; + enum mpam_msc_iface iface; + char uid[16]; + u32 acpi_id; + + if (!pdev) + return ERR_PTR(-ENOMEM); + + /* Some power management is described in the namespace: */ + err = snprintf(uid, sizeof(uid), "%u", tbl_msc->identifier); + if (err > 0 && err < sizeof(uid)) { + companion = acpi_dev_get_first_match_dev("ARMHAA5C", uid, -1); + if (companion) { + ACPI_COMPANION_SET(&pdev->dev, companion); + acpi_dev_put(companion); + } else { + pr_debug("MSC.%u: missing namespace entry\n", tbl_msc->identifier); + } + } + + if (decode_interface_type(tbl_msc, &iface)) { + pr_debug("MSC.%u: unknown interface type\n", tbl_msc->identifier); + return ERR_PTR(-EINVAL); + } + + if (iface == MPAM_IFACE_MMIO) { + res[next_res++] = DEFINE_RES_MEM_NAMED(tbl_msc->base_address, + tbl_msc->mmio_size, + "MPAM:MSC"); + } else if (iface == MPAM_IFACE_PCC) { + props[next_prop++] = PROPERTY_ENTRY_U32("pcc-channel", + tbl_msc->base_address); + } + + acpi_mpam_parse_irqs(pdev, tbl_msc, res, &next_res); + + WARN_ON_ONCE(next_res > ARRAY_SIZE(res)); + err = platform_device_add_resources(pdev, res, next_res); + if (err) + return ERR_PTR(err); + + props[next_prop++] = PROPERTY_ENTRY_U32("arm,not-ready-us", + tbl_msc->max_nrdy_usec); + + /* + * The MSC's CPU affinity is described via its linked power + * management device, but only if it points at a Processor or + * Processor Container. + */ + if (parse_msc_pm_link(tbl_msc, pdev, &acpi_id)) + props[next_prop++] = PROPERTY_ENTRY_U32("cpu_affinity", acpi_id); + + WARN_ON_ONCE(next_prop > ARRAY_SIZE(props) - 1); + err = device_create_managed_software_node(&pdev->dev, props, NULL); + if (err) + return ERR_PTR(err); + + /* + * Stash the table entry for acpi_mpam_parse_resources() to discover + * what this MSC controls. + */ + err = platform_device_add_data(pdev, tbl_msc, tbl_msc->length); + if (err) + return ERR_PTR(err); + + err = platform_device_add(pdev); + if (err) + return ERR_PTR(err); + + return_ptr(pdev); +} + +static int __init acpi_mpam_parse(void) +{ + char *table_end, *table_offset; + struct acpi_mpam_msc_node *tbl_msc; + struct platform_device *pdev; + + if (acpi_disabled || !system_supports_mpam()) + return 0; + + struct acpi_table_header *table __free(acpi_put_table) = + acpi_get_table_pointer(ACPI_SIG_MPAM, 0); + + if (IS_ERR(table)) + return 0; + + if (table->revision < 1) { + pr_debug("MPAM ACPI table revision %d not supported\n", table->revision); + return 0; + } + + table_offset = (char *)(table + 1); + table_end = (char *)table + table->length; + + while (table_offset < table_end) { + tbl_msc = (struct acpi_mpam_msc_node *)table_offset; + if (table_offset + sizeof(*tbl_msc) > table_end || + table_offset + tbl_msc->length > table_end) { + pr_err("MSC entry overlaps end of ACPI table\n"); + return -EINVAL; + } + table_offset += tbl_msc->length; + + /* + * If any of the reserved fields are set, make no attempt to + * parse the MSC structure. This MSC will still be counted by + * acpi_mpam_count_msc(), meaning the MPAM driver can't probe + * against all MSC, and will never be enabled. There is no way + * to enable it safely, because we cannot determine safe + * system-wide partid and pmg ranges in this situation. + */ + if (tbl_msc->reserved || tbl_msc->reserved1 || tbl_msc->reserved2) { + pr_err_once("Unrecognised MSC, MPAM not usable\n"); + pr_debug("MSC.%u: reserved field set\n", tbl_msc->identifier); + continue; + } + + if (!tbl_msc->mmio_size) { + pr_debug("MSC.%u: marked as disabled\n", tbl_msc->identifier); + continue; + } + + pdev = acpi_mpam_parse_msc(tbl_msc); + if (IS_ERR(pdev)) + return PTR_ERR(pdev); + } + + return 0; +} + +/** + * acpi_mpam_count_msc() - Count the number of MSC described by firmware. + * + * Returns the number of MSCs, or zero for an error. + * + * This can be called before or in parallel with acpi_mpam_parse(). + */ +int acpi_mpam_count_msc(void) +{ + char *table_end, *table_offset; + struct acpi_mpam_msc_node *tbl_msc; + int count = 0; + + if (acpi_disabled || !system_supports_mpam()) + return 0; + + struct acpi_table_header *table __free(acpi_put_table) = + acpi_get_table_pointer(ACPI_SIG_MPAM, 0); + + if (IS_ERR(table)) + return 0; + + if (table->revision < 1) + return 0; + + table_offset = (char *)(table + 1); + table_end = (char *)table + table->length; + + while (table_offset < table_end) { + tbl_msc = (struct acpi_mpam_msc_node *)table_offset; + + if (table_offset + sizeof(*tbl_msc) > table_end) + return -EINVAL; + if (tbl_msc->length < sizeof(*tbl_msc)) + return -EINVAL; + if (tbl_msc->length > table_end - table_offset) + return -EINVAL; + table_offset += tbl_msc->length; + + if (!tbl_msc->mmio_size) + continue; + + count++; + } + + return count; +} + +/* + * Call after ACPI devices have been created, which happens behind acpi_scan_init() + * called from subsys_initcall(). PCC requires the mailbox driver, which is + * initialised from postcore_initcall(). + */ +subsys_initcall_sync(acpi_mpam_parse); diff --git a/drivers/acpi/pptt.c b/drivers/acpi/pptt.c index 54676e3d82dd5..de5f8c018333d 100644 --- a/drivers/acpi/pptt.c +++ b/drivers/acpi/pptt.c @@ -21,6 +21,25 @@ #include #include +/* + * The acpi_pptt_cache_v1 in actbl2.h, which is imported from acpica, + * only contains the cache_id field rather than all the fields of the + * Cache Type Structure. Use this alternative structure until it is + * resolved in acpica. + */ +struct acpi_pptt_cache_v1_full { + struct acpi_subtable_header header; + u16 reserved; + u32 flags; + u32 next_level_of_cache; + u32 size; + u32 number_of_sets; + u8 associativity; + u8 attributes; + u16 line_size; + u32 cache_id; +} __packed; + static struct acpi_subtable_header *fetch_pptt_subtable(struct acpi_table_header *table_hdr, u32 pptt_ref) { @@ -56,6 +75,18 @@ static struct acpi_pptt_cache *fetch_pptt_cache(struct acpi_table_header *table_ return (struct acpi_pptt_cache *)fetch_pptt_subtable(table_hdr, pptt_ref); } +static struct acpi_pptt_cache_v1_full *upgrade_pptt_cache(struct acpi_pptt_cache *cache) +{ + if (cache->header.length < sizeof(struct acpi_pptt_cache_v1_full)) + return NULL; + + /* No use for v1 if the only additional field is invalid */ + if (!(cache->flags & ACPI_PPTT_CACHE_ID_VALID)) + return NULL; + + return (struct acpi_pptt_cache_v1_full *)cache; +} + static struct acpi_subtable_header *acpi_get_pptt_resource(struct acpi_table_header *table_hdr, struct acpi_pptt_processor *node, int resource) @@ -177,14 +208,14 @@ acpi_find_cache_level(struct acpi_table_header *table_hdr, } /** - * acpi_count_levels() - Given a PPTT table, and a CPU node, count the cache - * levels and split cache levels (data/instruction). + * acpi_count_levels() - Given a PPTT table, and a CPU node, count the + * total number of levels and split cache levels (data/instruction). * @table_hdr: Pointer to the head of the PPTT table * @cpu_node: processor node we wish to count caches for - * @levels: Number of levels if success. * @split_levels: Number of split cache levels (data/instruction) if * success. Can by NULL. * + * Return: number of levels. * Given a processor node containing a processing unit, walk into it and count * how many levels exist solely for it, and then walk up each level until we hit * the root node (ignore the package level because it may be possible to have @@ -192,14 +223,18 @@ acpi_find_cache_level(struct acpi_table_header *table_hdr, * split cache levels (data/instruction) that exist at each level on the way * up. */ -static void acpi_count_levels(struct acpi_table_header *table_hdr, - struct acpi_pptt_processor *cpu_node, - unsigned int *levels, unsigned int *split_levels) +static int acpi_count_levels(struct acpi_table_header *table_hdr, + struct acpi_pptt_processor *cpu_node, + unsigned int *split_levels) { + int current_level = 0; + do { - acpi_find_cache_level(table_hdr, cpu_node, levels, split_levels, 0, 0); + acpi_find_cache_level(table_hdr, cpu_node, ¤t_level, split_levels, 0, 0); cpu_node = fetch_pptt_node(table_hdr, cpu_node->parent); } while (cpu_node); + + return current_level; } /** @@ -351,7 +386,6 @@ static struct acpi_pptt_cache *acpi_find_cache_node(struct acpi_table_header *ta * @this_leaf: Kernel cache info structure being updated * @found_cache: The PPTT node describing this cache instance * @cpu_node: A unique reference to describe this cache instance - * @revision: The revision of the PPTT table * * The ACPI spec implies that the fields in the cache structures are used to * extend and correct the information probed from the hardware. Lets only @@ -361,10 +395,9 @@ static struct acpi_pptt_cache *acpi_find_cache_node(struct acpi_table_header *ta */ static void update_cache_properties(struct cacheinfo *this_leaf, struct acpi_pptt_cache *found_cache, - struct acpi_pptt_processor *cpu_node, - u8 revision) + struct acpi_pptt_processor *cpu_node) { - struct acpi_pptt_cache_v1* found_cache_v1; + struct acpi_pptt_cache_v1_full *found_cache_v1; this_leaf->fw_token = cpu_node; if (found_cache->flags & ACPI_PPTT_SIZE_PROPERTY_VALID) @@ -414,9 +447,8 @@ static void update_cache_properties(struct cacheinfo *this_leaf, found_cache->flags & ACPI_PPTT_CACHE_TYPE_VALID) this_leaf->type = CACHE_TYPE_UNIFIED; - if (revision >= 3 && (found_cache->flags & ACPI_PPTT_CACHE_ID_VALID)) { - found_cache_v1 = ACPI_ADD_PTR(struct acpi_pptt_cache_v1, - found_cache, sizeof(struct acpi_pptt_cache)); + found_cache_v1 = upgrade_pptt_cache(found_cache); + if (found_cache_v1) { this_leaf->id = found_cache_v1->cache_id; this_leaf->attributes |= CACHE_ID; } @@ -441,8 +473,7 @@ static void cache_setup_acpi_cpu(struct acpi_table_header *table, pr_debug("found = %p %p\n", found_cache, cpu_node); if (found_cache) update_cache_properties(this_leaf, found_cache, - ACPI_TO_POINTER(ACPI_PTR_DIFF(cpu_node, table)), - table->revision); + ACPI_TO_POINTER(ACPI_PTR_DIFF(cpu_node, table))); index++; } @@ -645,7 +676,7 @@ int acpi_get_cache_info(unsigned int cpu, unsigned int *levels, if (!cpu_node) return -ENOENT; - acpi_count_levels(table, cpu_node, levels, split_levels); + *levels = acpi_count_levels(table, cpu_node, split_levels); pr_debug("Cache Setup: last_level=%d split_levels=%d\n", *levels, split_levels ? *split_levels : -1); @@ -817,3 +848,218 @@ int find_acpi_cpu_topology_hetero_id(unsigned int cpu) return find_acpi_cpu_topology_tag(cpu, PPTT_ABORT_PACKAGE, ACPI_PPTT_ACPI_IDENTICAL); } + +/** + * acpi_pptt_get_child_cpus() - Find all the CPUs below a PPTT + * processor hierarchy node + * + * @table_hdr: A reference to the PPTT table + * @parent_node: A pointer to the processor hierarchy node in the + * table_hdr + * @cpus: A cpumask to fill with the CPUs below @parent_node + * + * Walks up the PPTT from every possible CPU to find if the provided + * @parent_node is a parent of this CPU. + */ +static void acpi_pptt_get_child_cpus(struct acpi_table_header *table_hdr, + struct acpi_pptt_processor *parent_node, + cpumask_t *cpus) +{ + struct acpi_pptt_processor *cpu_node; + u32 acpi_id; + int cpu; + + cpumask_clear(cpus); + + for_each_possible_cpu(cpu) { + acpi_id = get_acpi_id_for_cpu(cpu); + cpu_node = acpi_find_processor_node(table_hdr, acpi_id); + + while (cpu_node) { + if (cpu_node == parent_node) { + cpumask_set_cpu(cpu, cpus); + break; + } + cpu_node = fetch_pptt_node(table_hdr, cpu_node->parent); + } + } +} + +/** + * acpi_pptt_get_cpus_from_container() - Populate a cpumask with all CPUs in a + * processor container + * @acpi_cpu_id: The UID of the processor container + * @cpus: The resulting CPU mask + * + * Find the specified Processor Container, and fill @cpus with all the cpus + * below it. + * + * Not all 'Processor Hierarchy' entries in the PPTT are either a CPU + * or a Processor Container, they may exist purely to describe a + * Private resource. CPUs have to be leaves, so a Processor Container + * is a non-leaf that has the 'ACPI Processor ID valid' flag set. + */ +void acpi_pptt_get_cpus_from_container(u32 acpi_cpu_id, cpumask_t *cpus) +{ + struct acpi_table_header *table_hdr; + struct acpi_subtable_header *entry; + unsigned long table_end; + u32 proc_sz; + + cpumask_clear(cpus); + + table_hdr = acpi_get_pptt(); + if (!table_hdr) + return; + + table_end = (unsigned long)table_hdr + table_hdr->length; + entry = ACPI_ADD_PTR(struct acpi_subtable_header, table_hdr, + sizeof(struct acpi_table_pptt)); + proc_sz = sizeof(struct acpi_pptt_processor); + while ((unsigned long)entry + proc_sz <= table_end) { + if (entry->type == ACPI_PPTT_TYPE_PROCESSOR) { + struct acpi_pptt_processor *cpu_node; + + cpu_node = (struct acpi_pptt_processor *)entry; + if (cpu_node->flags & ACPI_PPTT_ACPI_PROCESSOR_ID_VALID && + !acpi_pptt_leaf_node(table_hdr, cpu_node) && + cpu_node->acpi_processor_id == acpi_cpu_id) { + acpi_pptt_get_child_cpus(table_hdr, cpu_node, cpus); + break; + } + } + entry = ACPI_ADD_PTR(struct acpi_subtable_header, entry, + entry->length); + } +} + +/** + * find_acpi_cache_level_from_id() - Get the level of the specified cache + * @cache_id: The id field of the cache + * + * Determine the level relative to any CPU for the cache identified by + * cache_id. This allows the property to be found even if the CPUs are offline. + * + * The returned level can be used to group caches that are peers. + * + * The PPTT table must be rev 3 or later. + * + * If one CPU's L2 is shared with another CPU as L3, this function will return + * an unpredictable value. + * + * Return: -ENOENT if the PPTT doesn't exist, the revision isn't supported or + * the cache cannot be found. + * Otherwise returns a value which represents the level of the specified cache. + */ +int find_acpi_cache_level_from_id(u32 cache_id) +{ + int cpu; + struct acpi_table_header *table; + + table = acpi_get_pptt(); + if (!table) + return -ENOENT; + + if (table->revision < 3) + return -ENOENT; + + for_each_possible_cpu(cpu) { + bool empty; + int level = 1; + u32 acpi_cpu_id = get_acpi_id_for_cpu(cpu); + struct acpi_pptt_cache *cache; + struct acpi_pptt_processor *cpu_node; + + cpu_node = acpi_find_processor_node(table, acpi_cpu_id); + if (!cpu_node) + continue; + + do { + int cache_type[] = {CACHE_TYPE_INST, CACHE_TYPE_DATA, CACHE_TYPE_UNIFIED}; + + empty = true; + for (int i = 0; i < ARRAY_SIZE(cache_type); i++) { + struct acpi_pptt_cache_v1_full *cache_v1; + + cache = acpi_find_cache_node(table, acpi_cpu_id, cache_type[i], + level, &cpu_node); + if (!cache) + continue; + + empty = false; + + cache_v1 = upgrade_pptt_cache(cache); + if (cache_v1 && cache_v1->cache_id == cache_id) + return level; + } + level++; + } while (!empty); + } + + return -ENOENT; +} + +/** + * acpi_pptt_get_cpumask_from_cache_id() - Get the cpus associated with the + * specified cache + * @cache_id: The id field of the cache + * @cpus: Where to build the cpumask + * + * Determine which CPUs are below this cache in the PPTT. This allows the property + * to be found even if the CPUs are offline. + * + * The PPTT table must be rev 3 or later, + * + * Return: -ENOENT if the PPTT doesn't exist, or the cache cannot be found. + * Otherwise returns 0 and sets the cpus in the provided cpumask. + */ +int acpi_pptt_get_cpumask_from_cache_id(u32 cache_id, cpumask_t *cpus) +{ + int cpu; + struct acpi_table_header *table; + + cpumask_clear(cpus); + + table = acpi_get_pptt(); + if (!table) + return -ENOENT; + + if (table->revision < 3) + return -ENOENT; + + for_each_possible_cpu(cpu) { + bool empty; + int level = 1; + u32 acpi_cpu_id = get_acpi_id_for_cpu(cpu); + struct acpi_pptt_cache *cache; + struct acpi_pptt_processor *cpu_node; + + cpu_node = acpi_find_processor_node(table, acpi_cpu_id); + if (!cpu_node) + continue; + + do { + int cache_type[] = {CACHE_TYPE_INST, CACHE_TYPE_DATA, CACHE_TYPE_UNIFIED}; + + empty = true; + for (int i = 0; i < ARRAY_SIZE(cache_type); i++) { + struct acpi_pptt_cache_v1_full *cache_v1; + + cache = acpi_find_cache_node(table, acpi_cpu_id, cache_type[i], + level, &cpu_node); + + if (!cache) + continue; + + empty = false; + + cache_v1 = upgrade_pptt_cache(cache); + if (cache_v1 && cache_v1->cache_id == cache_id) + cpumask_set_cpu(cpu, cpus); + } + level++; + } while (!empty); + } + + return 0; +} diff --git a/drivers/acpi/tables.c b/drivers/acpi/tables.c index 57fc8bc56166b..4286e4af10924 100644 --- a/drivers/acpi/tables.c +++ b/drivers/acpi/tables.c @@ -408,7 +408,7 @@ static const char table_sigs[][ACPI_NAMESEG_SIZE] __nonstring_array __initconst ACPI_SIG_PSDT, ACPI_SIG_RSDT, ACPI_SIG_XSDT, ACPI_SIG_SSDT, ACPI_SIG_IORT, ACPI_SIG_NFIT, ACPI_SIG_HMAT, ACPI_SIG_PPTT, ACPI_SIG_NHLT, ACPI_SIG_AEST, ACPI_SIG_CEDT, ACPI_SIG_AGDI, - ACPI_SIG_NBFT, ACPI_SIG_SWFT}; + ACPI_SIG_NBFT, ACPI_SIG_SWFT, ACPI_SIG_MPAM}; #define ACPI_HEADER_SIZE sizeof(struct acpi_table_header) diff --git a/drivers/dma-buf/Makefile b/drivers/dma-buf/Makefile index 70ec901edf2c5..2008fb7481b35 100644 --- a/drivers/dma-buf/Makefile +++ b/drivers/dma-buf/Makefile @@ -1,6 +1,6 @@ # SPDX-License-Identifier: GPL-2.0-only obj-y := dma-buf.o dma-fence.o dma-fence-array.o dma-fence-chain.o \ - dma-fence-unwrap.o dma-resv.o + dma-fence-unwrap.o dma-resv.o dma-buf-mapping.o obj-$(CONFIG_DMABUF_HEAPS) += dma-heap.o obj-$(CONFIG_DMABUF_HEAPS) += heaps/ obj-$(CONFIG_SYNC_FILE) += sync_file.o diff --git a/drivers/dma-buf/dma-buf-mapping.c b/drivers/dma-buf/dma-buf-mapping.c new file mode 100644 index 0000000000000..b7352e609fbdf --- /dev/null +++ b/drivers/dma-buf/dma-buf-mapping.c @@ -0,0 +1,248 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * DMA BUF Mapping Helpers + * + */ +#include +#include + +static struct scatterlist *fill_sg_entry(struct scatterlist *sgl, size_t length, + dma_addr_t addr) +{ + unsigned int len, nents; + int i; + + nents = DIV_ROUND_UP(length, UINT_MAX); + for (i = 0; i < nents; i++) { + len = min_t(size_t, length, UINT_MAX); + length -= len; + /* + * DMABUF abuses scatterlist to create a scatterlist + * that does not have any CPU list, only the DMA list. + * Always set the page related values to NULL to ensure + * importers can't use it. The phys_addr based DMA API + * does not require the CPU list for mapping or unmapping. + */ + sg_set_page(sgl, NULL, 0, 0); + sg_dma_address(sgl) = addr + (dma_addr_t)i * UINT_MAX; + sg_dma_len(sgl) = len; + sgl = sg_next(sgl); + } + + return sgl; +} + +static unsigned int calc_sg_nents(struct dma_iova_state *state, + struct dma_buf_phys_vec *phys_vec, + size_t nr_ranges, size_t size) +{ + unsigned int nents = 0; + size_t i; + + if (!state || !dma_use_iova(state)) { + for (i = 0; i < nr_ranges; i++) + nents += DIV_ROUND_UP(phys_vec[i].len, UINT_MAX); + } else { + /* + * In IOVA case, there is only one SG entry which spans + * for whole IOVA address space, but we need to make sure + * that it fits sg->length, maybe we need more. + */ + nents = DIV_ROUND_UP(size, UINT_MAX); + } + + return nents; +} + +/** + * struct dma_buf_dma - holds DMA mapping information + * @sgt: Scatter-gather table + * @state: DMA IOVA state relevant in IOMMU-based DMA + * @size: Total size of DMA transfer + */ +struct dma_buf_dma { + struct sg_table sgt; + struct dma_iova_state *state; + size_t size; +}; + +/** + * dma_buf_phys_vec_to_sgt - Returns the scatterlist table of the attachment + * from arrays of physical vectors. This funciton is intended for MMIO memory + * only. + * @attach: [in] attachment whose scatterlist is to be returned + * @provider: [in] p2pdma provider + * @phys_vec: [in] array of physical vectors + * @nr_ranges: [in] number of entries in phys_vec array + * @size: [in] total size of phys_vec + * @dir: [in] direction of DMA transfer + * + * Returns sg_table containing the scatterlist to be returned; returns ERR_PTR + * on error. May return -EINTR if it is interrupted by a signal. + * + * On success, the DMA addresses and lengths in the returned scatterlist are + * PAGE_SIZE aligned. + * + * A mapping must be unmapped by using dma_buf_free_sgt(). + * + * NOTE: This function is intended for exporters. If direct traffic routing is + * mandatory exporter should call routing pci_p2pdma_map_type() before calling + * this function. + */ +struct sg_table *dma_buf_phys_vec_to_sgt(struct dma_buf_attachment *attach, + struct p2pdma_provider *provider, + struct dma_buf_phys_vec *phys_vec, + size_t nr_ranges, size_t size, + enum dma_data_direction dir) +{ + unsigned int nents, mapped_len = 0; + struct dma_buf_dma *dma; + struct scatterlist *sgl; + dma_addr_t addr; + size_t i; + int ret; + + dma_resv_assert_held(attach->dmabuf->resv); + + if (WARN_ON(!attach || !attach->dmabuf || !provider)) + /* This function is supposed to work on MMIO memory only */ + return ERR_PTR(-EINVAL); + + dma = kzalloc(sizeof(*dma), GFP_KERNEL); + if (!dma) + return ERR_PTR(-ENOMEM); + + switch (pci_p2pdma_map_type(provider, attach->dev)) { + case PCI_P2PDMA_MAP_BUS_ADDR: + /* + * There is no need in IOVA at all for this flow. + */ + break; + case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE: + dma->state = kzalloc(sizeof(*dma->state), GFP_KERNEL); + if (!dma->state) { + ret = -ENOMEM; + goto err_free_dma; + } + + dma_iova_try_alloc(attach->dev, dma->state, 0, size); + break; + default: + ret = -EINVAL; + goto err_free_dma; + } + + nents = calc_sg_nents(dma->state, phys_vec, nr_ranges, size); + ret = sg_alloc_table(&dma->sgt, nents, GFP_KERNEL | __GFP_ZERO); + if (ret) + goto err_free_state; + + sgl = dma->sgt.sgl; + + for (i = 0; i < nr_ranges; i++) { + if (!dma->state) { + addr = pci_p2pdma_bus_addr_map(provider, + phys_vec[i].paddr); + } else if (dma_use_iova(dma->state)) { + ret = dma_iova_link(attach->dev, dma->state, + phys_vec[i].paddr, 0, + phys_vec[i].len, dir, + DMA_ATTR_MMIO); + if (ret) + goto err_unmap_dma; + + mapped_len += phys_vec[i].len; + } else { + addr = dma_map_phys(attach->dev, phys_vec[i].paddr, + phys_vec[i].len, dir, + DMA_ATTR_MMIO); + ret = dma_mapping_error(attach->dev, addr); + if (ret) + goto err_unmap_dma; + } + + if (!dma->state || !dma_use_iova(dma->state)) + sgl = fill_sg_entry(sgl, phys_vec[i].len, addr); + } + + if (dma->state && dma_use_iova(dma->state)) { + WARN_ON_ONCE(mapped_len != size); + ret = dma_iova_sync(attach->dev, dma->state, 0, mapped_len); + if (ret) + goto err_unmap_dma; + + sgl = fill_sg_entry(sgl, mapped_len, dma->state->addr); + } + + dma->size = size; + + /* + * No CPU list included — set orig_nents = 0 so others can detect + * this via SG table (use nents only). + */ + dma->sgt.orig_nents = 0; + + + /* + * SGL must be NULL to indicate that SGL is the last one + * and we allocated correct number of entries in sg_alloc_table() + */ + WARN_ON_ONCE(sgl); + return &dma->sgt; + +err_unmap_dma: + if (!i || !dma->state) { + ; /* Do nothing */ + } else if (dma_use_iova(dma->state)) { + dma_iova_destroy(attach->dev, dma->state, mapped_len, dir, + DMA_ATTR_MMIO); + } else { + for_each_sgtable_dma_sg(&dma->sgt, sgl, i) + dma_unmap_phys(attach->dev, sg_dma_address(sgl), + sg_dma_len(sgl), dir, DMA_ATTR_MMIO); + } + sg_free_table(&dma->sgt); +err_free_state: + kfree(dma->state); +err_free_dma: + kfree(dma); + return ERR_PTR(ret); +} +EXPORT_SYMBOL_NS_GPL(dma_buf_phys_vec_to_sgt, "DMA_BUF"); + +/** + * dma_buf_free_sgt- unmaps the buffer + * @attach: [in] attachment to unmap buffer from + * @sgt: [in] scatterlist info of the buffer to unmap + * @dir: [in] direction of DMA transfer + * + * This unmaps a DMA mapping for @attached obtained + * by dma_buf_phys_vec_to_sgt(). + */ +void dma_buf_free_sgt(struct dma_buf_attachment *attach, struct sg_table *sgt, + enum dma_data_direction dir) +{ + struct dma_buf_dma *dma = container_of(sgt, struct dma_buf_dma, sgt); + int i; + + dma_resv_assert_held(attach->dmabuf->resv); + + if (!dma->state) { + ; /* Do nothing */ + } else if (dma_use_iova(dma->state)) { + dma_iova_destroy(attach->dev, dma->state, dma->size, dir, + DMA_ATTR_MMIO); + } else { + struct scatterlist *sgl; + + for_each_sgtable_dma_sg(sgt, sgl, i) + dma_unmap_phys(attach->dev, sg_dma_address(sgl), + sg_dma_len(sgl), dir, DMA_ATTR_MMIO); + } + + sg_free_table(sgt); + kfree(dma->state); + kfree(dma); + +} +EXPORT_SYMBOL_NS_GPL(dma_buf_free_sgt, "DMA_BUF"); diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index dfbd09e2e0ca5..f56824461b58f 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -2848,6 +2848,12 @@ static bool amd_iommu_capable(struct device *dev, enum iommu_cap cap) return amd_iommu_hd_support(iommu); } + case IOMMU_CAP_PCI_ATS_SUPPORTED: { + struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev); + + return amd_iommu_iotlb_sup && + (dev_data->flags & AMD_IOMMU_DEVICE_FLAG_ATS_SUP); + } default: break; } diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 2557bf561f4bf..e0d07c4ec7f09 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -107,6 +107,7 @@ static const char * const event_class_str[] = { }; static int arm_smmu_alloc_cd_tables(struct arm_smmu_master *master); +static bool arm_smmu_ats_supported(struct arm_smmu_master *master); static void parse_driver_options(struct arm_smmu_device *smmu) { @@ -2518,6 +2519,8 @@ static bool arm_smmu_capable(struct device *dev, enum iommu_cap cap) return true; case IOMMU_CAP_DIRTY_TRACKING: return arm_smmu_dbm_capable(master->smmu); + case IOMMU_CAP_PCI_ATS_SUPPORTED: + return arm_smmu_ats_supported(master); default: return false; } diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index f1fb27681b0bd..c92088855450a 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -1439,8 +1439,8 @@ int iommu_dma_map_sg(struct device *dev, struct scatterlist *sg, int nents, * as a bus address, __finalise_sg() will copy the dma * address into the output segment. */ - s->dma_address = pci_p2pdma_bus_addr_map(&p2pdma_state, - sg_phys(s)); + s->dma_address = pci_p2pdma_bus_addr_map( + p2pdma_state.mem, sg_phys(s)); sg_dma_len(s) = sg->length; sg_dma_mark_bus_address(s); continue; diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 79676188f60f3..bf2e7520eaeac 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -3759,6 +3759,8 @@ static bool intel_iommu_capable(struct device *dev, enum iommu_cap cap) return ecap_sc_support(info->iommu->ecap); case IOMMU_CAP_DIRTY_TRACKING: return ssads_supported(info->iommu); + case IOMMU_CAP_PCI_ATS_SUPPORTED: + return info->ats_supported; default: return false; } diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c index 4c842368289f0..e9d012ad4010d 100644 --- a/drivers/iommu/iommufd/device.c +++ b/drivers/iommu/iommufd/device.c @@ -1624,6 +1624,10 @@ int iommufd_get_hw_info(struct iommufd_ucmd *ucmd) if (device_iommu_capable(idev->dev, IOMMU_CAP_DIRTY_TRACKING)) cmd->out_capabilities |= IOMMU_HW_CAP_DIRTY_TRACKING; + /* Report when ATS cannot be used for this device */ + if (!device_iommu_capable(idev->dev, IOMMU_CAP_PCI_ATS_SUPPORTED)) + cmd->out_capabilities |= IOMMU_HW_CAP_PCI_ATS_NOT_SUPPORTED; + cmd->out_max_pasid_log2 = 0; /* * Currently, all iommu drivers enable PASID in the probe_device() diff --git a/drivers/iommu/iommufd/io_pagetable.c b/drivers/iommu/iommufd/io_pagetable.c index a4d7a9d435603..dc26019b0dbfd 100644 --- a/drivers/iommu/iommufd/io_pagetable.c +++ b/drivers/iommu/iommufd/io_pagetable.c @@ -8,8 +8,10 @@ * The datastructure uses the iopt_pages to optimize the storage of the PFNs * between the domains and xarray. */ +#include #include #include +#include #include #include #include @@ -284,6 +286,9 @@ static int iopt_alloc_area_pages(struct io_pagetable *iopt, case IOPT_ADDRESS_FILE: start = elm->start_byte + elm->pages->start; break; + case IOPT_ADDRESS_DMABUF: + start = elm->start_byte + elm->pages->dmabuf.start; + break; } rc = iopt_alloc_iova(iopt, dst_iova, start, length); if (rc) @@ -468,25 +473,53 @@ int iopt_map_user_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt, * @iopt: io_pagetable to act on * @iova: If IOPT_ALLOC_IOVA is set this is unused on input and contains * the chosen iova on output. Otherwise is the iova to map to on input - * @file: file to map + * @fd: fdno of a file to map * @start: map file starting at this byte offset * @length: Number of bytes to map * @iommu_prot: Combination of IOMMU_READ/WRITE/etc bits for the mapping * @flags: IOPT_ALLOC_IOVA or zero */ int iopt_map_file_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt, - unsigned long *iova, struct file *file, - unsigned long start, unsigned long length, - int iommu_prot, unsigned int flags) + unsigned long *iova, int fd, unsigned long start, + unsigned long length, int iommu_prot, + unsigned int flags) { struct iopt_pages *pages; + struct dma_buf *dmabuf; + unsigned long start_byte; + unsigned long last; + + if (!length) + return -EINVAL; + if (check_add_overflow(start, length - 1, &last)) + return -EOVERFLOW; + + start_byte = start - ALIGN_DOWN(start, PAGE_SIZE); + dmabuf = dma_buf_get(fd); + if (!IS_ERR(dmabuf)) { + pages = iopt_alloc_dmabuf_pages(ictx, dmabuf, start_byte, start, + length, + iommu_prot & IOMMU_WRITE); + if (IS_ERR(pages)) { + dma_buf_put(dmabuf); + return PTR_ERR(pages); + } + } else { + struct file *file; + + file = fget(fd); + if (!file) + return -EBADF; + + pages = iopt_alloc_file_pages(file, start_byte, start, length, + iommu_prot & IOMMU_WRITE); + fput(file); + if (IS_ERR(pages)) + return PTR_ERR(pages); + } - pages = iopt_alloc_file_pages(file, start, length, - iommu_prot & IOMMU_WRITE); - if (IS_ERR(pages)) - return PTR_ERR(pages); return iopt_map_common(ictx, iopt, pages, iova, length, - start - pages->start, iommu_prot, flags); + start_byte, iommu_prot, flags); } struct iova_bitmap_fn_arg { @@ -971,9 +1004,15 @@ static void iopt_unfill_domain(struct io_pagetable *iopt, WARN_ON(!area->storage_domain); if (area->storage_domain == domain) area->storage_domain = storage_domain; + if (iopt_is_dmabuf(pages)) { + if (!iopt_dmabuf_revoked(pages)) + iopt_area_unmap_domain(area, domain); + iopt_dmabuf_untrack_domain(pages, area, domain); + } mutex_unlock(&pages->mutex); - iopt_area_unmap_domain(area, domain); + if (!iopt_is_dmabuf(pages)) + iopt_area_unmap_domain(area, domain); } return; } @@ -990,6 +1029,8 @@ static void iopt_unfill_domain(struct io_pagetable *iopt, WARN_ON(area->storage_domain != domain); area->storage_domain = NULL; iopt_area_unfill_domain(area, pages, domain); + if (iopt_is_dmabuf(pages)) + iopt_dmabuf_untrack_domain(pages, area, domain); mutex_unlock(&pages->mutex); } } @@ -1019,10 +1060,16 @@ static int iopt_fill_domain(struct io_pagetable *iopt, if (!pages) continue; - mutex_lock(&pages->mutex); + guard(mutex)(&pages->mutex); + if (iopt_is_dmabuf(pages)) { + rc = iopt_dmabuf_track_domain(pages, area, domain); + if (rc) + goto out_unfill; + } rc = iopt_area_fill_domain(area, domain); if (rc) { - mutex_unlock(&pages->mutex); + if (iopt_is_dmabuf(pages)) + iopt_dmabuf_untrack_domain(pages, area, domain); goto out_unfill; } if (!area->storage_domain) { @@ -1031,7 +1078,6 @@ static int iopt_fill_domain(struct io_pagetable *iopt, interval_tree_insert(&area->pages_node, &pages->domains_itree); } - mutex_unlock(&pages->mutex); } return 0; @@ -1052,6 +1098,8 @@ static int iopt_fill_domain(struct io_pagetable *iopt, area->storage_domain = NULL; } iopt_area_unfill_domain(area, pages, domain); + if (iopt_is_dmabuf(pages)) + iopt_dmabuf_untrack_domain(pages, area, domain); mutex_unlock(&pages->mutex); } return rc; @@ -1262,6 +1310,10 @@ static int iopt_area_split(struct iopt_area *area, unsigned long iova) if (!pages || area->prevent_access) return -EBUSY; + /* Maintaining the domains_itree below is a bit complicated */ + if (iopt_is_dmabuf(pages)) + return -EOPNOTSUPP; + if (new_start & (alignment - 1) || iopt_area_start_byte(area, new_start) & (alignment - 1)) return -EINVAL; diff --git a/drivers/iommu/iommufd/io_pagetable.h b/drivers/iommu/iommufd/io_pagetable.h index b6064f4ce4af9..14cd052fd3204 100644 --- a/drivers/iommu/iommufd/io_pagetable.h +++ b/drivers/iommu/iommufd/io_pagetable.h @@ -5,6 +5,7 @@ #ifndef __IO_PAGETABLE_H #define __IO_PAGETABLE_H +#include #include #include #include @@ -69,6 +70,16 @@ void iopt_area_unfill_domain(struct iopt_area *area, struct iopt_pages *pages, void iopt_area_unmap_domain(struct iopt_area *area, struct iommu_domain *domain); +int iopt_dmabuf_track_domain(struct iopt_pages *pages, struct iopt_area *area, + struct iommu_domain *domain); +void iopt_dmabuf_untrack_domain(struct iopt_pages *pages, + struct iopt_area *area, + struct iommu_domain *domain); +int iopt_dmabuf_track_all_domains(struct iopt_area *area, + struct iopt_pages *pages); +void iopt_dmabuf_untrack_all_domains(struct iopt_area *area, + struct iopt_pages *pages); + static inline unsigned long iopt_area_index(struct iopt_area *area) { return area->pages_node.start; @@ -179,7 +190,22 @@ enum { enum iopt_address_type { IOPT_ADDRESS_USER = 0, - IOPT_ADDRESS_FILE = 1, + IOPT_ADDRESS_FILE, + IOPT_ADDRESS_DMABUF, +}; + +struct iopt_pages_dmabuf_track { + struct iommu_domain *domain; + struct iopt_area *area; + struct list_head elm; +}; + +struct iopt_pages_dmabuf { + struct dma_buf_attachment *attach; + struct dma_buf_phys_vec phys; + /* Always PAGE_SIZE aligned */ + unsigned long start; + struct list_head tracker; }; /* @@ -209,6 +235,8 @@ struct iopt_pages { struct file *file; unsigned long start; }; + /* IOPT_ADDRESS_DMABUF */ + struct iopt_pages_dmabuf dmabuf; }; bool writable:1; u8 account_mode; @@ -220,10 +248,32 @@ struct iopt_pages { struct rb_root_cached domains_itree; }; +static inline bool iopt_is_dmabuf(struct iopt_pages *pages) +{ + if (!IS_ENABLED(CONFIG_DMA_SHARED_BUFFER)) + return false; + return pages->type == IOPT_ADDRESS_DMABUF; +} + +static inline bool iopt_dmabuf_revoked(struct iopt_pages *pages) +{ + lockdep_assert_held(&pages->mutex); + if (iopt_is_dmabuf(pages)) + return pages->dmabuf.phys.len == 0; + return false; +} + struct iopt_pages *iopt_alloc_user_pages(void __user *uptr, unsigned long length, bool writable); -struct iopt_pages *iopt_alloc_file_pages(struct file *file, unsigned long start, +struct iopt_pages *iopt_alloc_file_pages(struct file *file, + unsigned long start_byte, + unsigned long start, unsigned long length, bool writable); +struct iopt_pages *iopt_alloc_dmabuf_pages(struct iommufd_ctx *ictx, + struct dma_buf *dmabuf, + unsigned long start_byte, + unsigned long start, + unsigned long length, bool writable); void iopt_release_pages(struct kref *kref); static inline void iopt_put_pages(struct iopt_pages *pages) { diff --git a/drivers/iommu/iommufd/ioas.c b/drivers/iommu/iommufd/ioas.c index 459a7c5169154..f4721afedadcf 100644 --- a/drivers/iommu/iommufd/ioas.c +++ b/drivers/iommu/iommufd/ioas.c @@ -207,7 +207,6 @@ int iommufd_ioas_map_file(struct iommufd_ucmd *ucmd) unsigned long iova = cmd->iova; struct iommufd_ioas *ioas; unsigned int flags = 0; - struct file *file; int rc; if (cmd->flags & @@ -229,11 +228,7 @@ int iommufd_ioas_map_file(struct iommufd_ucmd *ucmd) if (!(cmd->flags & IOMMU_IOAS_MAP_FIXED_IOVA)) flags = IOPT_ALLOC_IOVA; - file = fget(cmd->fd); - if (!file) - return -EBADF; - - rc = iopt_map_file_pages(ucmd->ictx, &ioas->iopt, &iova, file, + rc = iopt_map_file_pages(ucmd->ictx, &ioas->iopt, &iova, cmd->fd, cmd->start, cmd->length, conv_iommu_prot(cmd->flags), flags); if (rc) @@ -243,7 +238,6 @@ int iommufd_ioas_map_file(struct iommufd_ucmd *ucmd) rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd)); out_put: iommufd_put_object(ucmd->ictx, &ioas->obj); - fput(file); return rc; } diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h index 85d0843ed07b0..eb6d1a70f6732 100644 --- a/drivers/iommu/iommufd/iommufd_private.h +++ b/drivers/iommu/iommufd/iommufd_private.h @@ -19,6 +19,8 @@ struct iommu_domain; struct iommu_group; struct iommu_option; struct iommufd_device; +struct dma_buf_attachment; +struct dma_buf_phys_vec; struct iommufd_sw_msi_map { struct list_head sw_msi_item; @@ -108,7 +110,7 @@ int iopt_map_user_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt, unsigned long length, int iommu_prot, unsigned int flags); int iopt_map_file_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt, - unsigned long *iova, struct file *file, + unsigned long *iova, int fd, unsigned long start, unsigned long length, int iommu_prot, unsigned int flags); int iopt_map_pages(struct io_pagetable *iopt, struct list_head *pages_list, @@ -504,6 +506,8 @@ void iommufd_device_pre_destroy(struct iommufd_object *obj); void iommufd_device_destroy(struct iommufd_object *obj); int iommufd_get_hw_info(struct iommufd_ucmd *ucmd); +struct device *iommufd_global_device(void); + struct iommufd_access { struct iommufd_object obj; struct iommufd_ctx *ictx; @@ -713,6 +717,8 @@ bool iommufd_should_fail(void); int __init iommufd_test_init(void); void iommufd_test_exit(void); bool iommufd_selftest_is_mock_dev(struct device *dev); +int iommufd_test_dma_buf_iommufd_map(struct dma_buf_attachment *attachment, + struct dma_buf_phys_vec *phys); #else static inline void iommufd_test_syz_conv_iova_id(struct iommufd_ucmd *ucmd, unsigned int ioas_id, @@ -734,5 +740,11 @@ static inline bool iommufd_selftest_is_mock_dev(struct device *dev) { return false; } +static inline int +iommufd_test_dma_buf_iommufd_map(struct dma_buf_attachment *attachment, + struct dma_buf_phys_vec *phys) +{ + return -EOPNOTSUPP; +} #endif #endif diff --git a/drivers/iommu/iommufd/iommufd_test.h b/drivers/iommu/iommufd/iommufd_test.h index 8fc618b2bcf96..9166c39eb0c8b 100644 --- a/drivers/iommu/iommufd/iommufd_test.h +++ b/drivers/iommu/iommufd/iommufd_test.h @@ -29,6 +29,8 @@ enum { IOMMU_TEST_OP_PASID_REPLACE, IOMMU_TEST_OP_PASID_DETACH, IOMMU_TEST_OP_PASID_CHECK_HWPT, + IOMMU_TEST_OP_DMABUF_GET, + IOMMU_TEST_OP_DMABUF_REVOKE, }; enum { @@ -176,6 +178,14 @@ struct iommu_test_cmd { __u32 hwpt_id; /* @id is stdev_id */ } pasid_check; + struct { + __u32 length; + __u32 open_flags; + } dmabuf_get; + struct { + __s32 dmabuf_fd; + __u32 revoked; + } dmabuf_revoke; }; __u32 last; }; diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c index ce775fbbae94e..5cc4b08c25f58 100644 --- a/drivers/iommu/iommufd/main.c +++ b/drivers/iommu/iommufd/main.c @@ -751,6 +751,15 @@ static struct miscdevice vfio_misc_dev = { .mode = 0666, }; +/* + * Used only by DMABUF, returns a valid struct device to use as a dummy struct + * device for attachment. + */ +struct device *iommufd_global_device(void) +{ + return iommu_misc_dev.this_device; +} + static int __init iommufd_init(void) { int ret; @@ -794,5 +803,6 @@ MODULE_ALIAS("devname:vfio/vfio"); #endif MODULE_IMPORT_NS("IOMMUFD_INTERNAL"); MODULE_IMPORT_NS("IOMMUFD"); +MODULE_IMPORT_NS("DMA_BUF"); MODULE_DESCRIPTION("I/O Address Space Management for passthrough devices"); MODULE_LICENSE("GPL"); diff --git a/drivers/iommu/iommufd/pages.c b/drivers/iommu/iommufd/pages.c index c3433b8455617..dbe51ecb9a20f 100644 --- a/drivers/iommu/iommufd/pages.c +++ b/drivers/iommu/iommufd/pages.c @@ -45,6 +45,8 @@ * last_iova + 1 can overflow. An iopt_pages index will always be much less than * ULONG_MAX so last_index + 1 cannot overflow. */ +#include +#include #include #include #include @@ -53,6 +55,7 @@ #include #include #include +#include #include "double_span.h" #include "io_pagetable.h" @@ -258,6 +261,11 @@ static struct iopt_area *iopt_pages_find_domain_area(struct iopt_pages *pages, return container_of(node, struct iopt_area, pages_node); } +enum batch_kind { + BATCH_CPU_MEMORY = 0, + BATCH_MMIO, +}; + /* * A simple datastructure to hold a vector of PFNs, optimized for contiguous * PFNs. This is used as a temporary holding memory for shuttling pfns from one @@ -271,7 +279,9 @@ struct pfn_batch { unsigned int array_size; unsigned int end; unsigned int total_pfns; + enum batch_kind kind; }; +enum { MAX_NPFNS = type_max(typeof(((struct pfn_batch *)0)->npfns[0])) }; static void batch_clear(struct pfn_batch *batch) { @@ -348,11 +358,17 @@ static void batch_destroy(struct pfn_batch *batch, void *backup) } static bool batch_add_pfn_num(struct pfn_batch *batch, unsigned long pfn, - u32 nr) + u32 nr, enum batch_kind kind) { - const unsigned int MAX_NPFNS = type_max(typeof(*batch->npfns)); unsigned int end = batch->end; + if (batch->kind != kind) { + /* One kind per batch */ + if (batch->end != 0) + return false; + batch->kind = kind; + } + if (end && pfn == batch->pfns[end - 1] + batch->npfns[end - 1] && nr <= MAX_NPFNS - batch->npfns[end - 1]) { batch->npfns[end - 1] += nr; @@ -379,7 +395,7 @@ static void batch_remove_pfn_num(struct pfn_batch *batch, unsigned long nr) /* true if the pfn was added, false otherwise */ static bool batch_add_pfn(struct pfn_batch *batch, unsigned long pfn) { - return batch_add_pfn_num(batch, pfn, 1); + return batch_add_pfn_num(batch, pfn, 1, BATCH_CPU_MEMORY); } /* @@ -492,6 +508,7 @@ static int batch_to_domain(struct pfn_batch *batch, struct iommu_domain *domain, { bool disable_large_pages = area->iopt->disable_large_pages; unsigned long last_iova = iopt_area_last_iova(area); + int iommu_prot = area->iommu_prot; unsigned int page_offset = 0; unsigned long start_iova; unsigned long next_iova; @@ -499,6 +516,11 @@ static int batch_to_domain(struct pfn_batch *batch, struct iommu_domain *domain, unsigned long iova; int rc; + if (batch->kind == BATCH_MMIO) { + iommu_prot &= ~IOMMU_CACHE; + iommu_prot |= IOMMU_MMIO; + } + /* The first index might be a partial page */ if (start_index == iopt_area_index(area)) page_offset = area->page_offset; @@ -512,11 +534,11 @@ static int batch_to_domain(struct pfn_batch *batch, struct iommu_domain *domain, rc = batch_iommu_map_small( domain, iova, PFN_PHYS(batch->pfns[cur]) + page_offset, - next_iova - iova, area->iommu_prot); + next_iova - iova, iommu_prot); else rc = iommu_map(domain, iova, PFN_PHYS(batch->pfns[cur]) + page_offset, - next_iova - iova, area->iommu_prot, + next_iova - iova, iommu_prot, GFP_KERNEL_ACCOUNT); if (rc) goto err_unmap; @@ -652,7 +674,7 @@ static int batch_from_folios(struct pfn_batch *batch, struct folio ***folios_p, nr = min(nr, npages); npages -= nr; - if (!batch_add_pfn_num(batch, pfn, nr)) + if (!batch_add_pfn_num(batch, pfn, nr, BATCH_CPU_MEMORY)) break; if (nr > 1) { rc = folio_add_pins(folio, nr - 1); @@ -1054,6 +1076,41 @@ static int pfn_reader_user_update_pinned(struct pfn_reader_user *user, return iopt_pages_update_pinned(pages, npages, inc, user); } +struct pfn_reader_dmabuf { + struct dma_buf_phys_vec phys; + unsigned long start_offset; +}; + +static int pfn_reader_dmabuf_init(struct pfn_reader_dmabuf *dmabuf, + struct iopt_pages *pages) +{ + /* Callers must not get here if the dmabuf was already revoked */ + if (WARN_ON(iopt_dmabuf_revoked(pages))) + return -EINVAL; + + dmabuf->phys = pages->dmabuf.phys; + dmabuf->start_offset = pages->dmabuf.start; + return 0; +} + +static int pfn_reader_fill_dmabuf(struct pfn_reader_dmabuf *dmabuf, + struct pfn_batch *batch, + unsigned long start_index, + unsigned long last_index) +{ + unsigned long start = dmabuf->start_offset + start_index * PAGE_SIZE; + + /* + * start/last_index and start are all PAGE_SIZE aligned, the batch is + * always filled using page size aligned PFNs just like the other types. + * If the dmabuf has been sliced on a sub page offset then the common + * batch to domain code will adjust it before mapping to the domain. + */ + batch_add_pfn_num(batch, PHYS_PFN(dmabuf->phys.paddr + start), + last_index - start_index + 1, BATCH_MMIO); + return 0; +} + /* * PFNs are stored in three places, in order of preference: * - The iopt_pages xarray. This is only populated if there is a @@ -1072,7 +1129,10 @@ struct pfn_reader { unsigned long batch_end_index; unsigned long last_index; - struct pfn_reader_user user; + union { + struct pfn_reader_user user; + struct pfn_reader_dmabuf dmabuf; + }; }; static int pfn_reader_update_pinned(struct pfn_reader *pfns) @@ -1108,7 +1168,7 @@ static int pfn_reader_fill_span(struct pfn_reader *pfns) { struct interval_tree_double_span_iter *span = &pfns->span; unsigned long start_index = pfns->batch_end_index; - struct pfn_reader_user *user = &pfns->user; + struct pfn_reader_user *user; unsigned long npages; struct iopt_area *area; int rc; @@ -1140,8 +1200,13 @@ static int pfn_reader_fill_span(struct pfn_reader *pfns) return 0; } - if (start_index >= pfns->user.upages_end) { - rc = pfn_reader_user_pin(&pfns->user, pfns->pages, start_index, + if (iopt_is_dmabuf(pfns->pages)) + return pfn_reader_fill_dmabuf(&pfns->dmabuf, &pfns->batch, + start_index, span->last_hole); + + user = &pfns->user; + if (start_index >= user->upages_end) { + rc = pfn_reader_user_pin(user, pfns->pages, start_index, span->last_hole); if (rc) return rc; @@ -1209,7 +1274,10 @@ static int pfn_reader_init(struct pfn_reader *pfns, struct iopt_pages *pages, pfns->batch_start_index = start_index; pfns->batch_end_index = start_index; pfns->last_index = last_index; - pfn_reader_user_init(&pfns->user, pages); + if (iopt_is_dmabuf(pages)) + pfn_reader_dmabuf_init(&pfns->dmabuf, pages); + else + pfn_reader_user_init(&pfns->user, pages); rc = batch_init(&pfns->batch, last_index - start_index + 1); if (rc) return rc; @@ -1230,8 +1298,12 @@ static int pfn_reader_init(struct pfn_reader *pfns, struct iopt_pages *pages, static void pfn_reader_release_pins(struct pfn_reader *pfns) { struct iopt_pages *pages = pfns->pages; - struct pfn_reader_user *user = &pfns->user; + struct pfn_reader_user *user; + + if (iopt_is_dmabuf(pages)) + return; + user = &pfns->user; if (user->upages_end > pfns->batch_end_index) { /* Any pages not transferred to the batch are just unpinned */ @@ -1261,7 +1333,8 @@ static void pfn_reader_destroy(struct pfn_reader *pfns) struct iopt_pages *pages = pfns->pages; pfn_reader_release_pins(pfns); - pfn_reader_user_destroy(&pfns->user, pfns->pages); + if (!iopt_is_dmabuf(pfns->pages)) + pfn_reader_user_destroy(&pfns->user, pfns->pages); batch_destroy(&pfns->batch, NULL); WARN_ON(pages->last_npinned != pages->npinned); } @@ -1340,26 +1413,234 @@ struct iopt_pages *iopt_alloc_user_pages(void __user *uptr, return pages; } -struct iopt_pages *iopt_alloc_file_pages(struct file *file, unsigned long start, +struct iopt_pages *iopt_alloc_file_pages(struct file *file, + unsigned long start_byte, + unsigned long start, unsigned long length, bool writable) { struct iopt_pages *pages; - unsigned long start_down = ALIGN_DOWN(start, PAGE_SIZE); - unsigned long end; - if (length && check_add_overflow(start, length - 1, &end)) - return ERR_PTR(-EOVERFLOW); - - pages = iopt_alloc_pages(start - start_down, length, writable); + pages = iopt_alloc_pages(start_byte, length, writable); if (IS_ERR(pages)) return pages; pages->file = get_file(file); - pages->start = start_down; + pages->start = start - start_byte; pages->type = IOPT_ADDRESS_FILE; return pages; } +static void iopt_revoke_notify(struct dma_buf_attachment *attach) +{ + struct iopt_pages *pages = attach->importer_priv; + struct iopt_pages_dmabuf_track *track; + + guard(mutex)(&pages->mutex); + if (iopt_dmabuf_revoked(pages)) + return; + + list_for_each_entry(track, &pages->dmabuf.tracker, elm) { + struct iopt_area *area = track->area; + + iopt_area_unmap_domain_range(area, track->domain, + iopt_area_index(area), + iopt_area_last_index(area)); + } + pages->dmabuf.phys.len = 0; +} + +static struct dma_buf_attach_ops iopt_dmabuf_attach_revoke_ops = { + .allow_peer2peer = true, + .move_notify = iopt_revoke_notify, +}; + +/* + * iommufd and vfio have a circular dependency. Future work for a phys + * based private interconnect will remove this. + */ +static int +sym_vfio_pci_dma_buf_iommufd_map(struct dma_buf_attachment *attachment, + struct dma_buf_phys_vec *phys) +{ + typeof(&vfio_pci_dma_buf_iommufd_map) fn; + int rc; + + rc = iommufd_test_dma_buf_iommufd_map(attachment, phys); + if (rc != -EOPNOTSUPP) + return rc; + + if (!IS_ENABLED(CONFIG_VFIO_PCI_DMABUF)) + return -EOPNOTSUPP; + + fn = symbol_get(vfio_pci_dma_buf_iommufd_map); + if (!fn) + return -EOPNOTSUPP; + rc = fn(attachment, phys); + symbol_put(vfio_pci_dma_buf_iommufd_map); + return rc; +} + +static int iopt_map_dmabuf(struct iommufd_ctx *ictx, struct iopt_pages *pages, + struct dma_buf *dmabuf) +{ + struct dma_buf_attachment *attach; + int rc; + + attach = dma_buf_dynamic_attach(dmabuf, iommufd_global_device(), + &iopt_dmabuf_attach_revoke_ops, pages); + if (IS_ERR(attach)) + return PTR_ERR(attach); + + dma_resv_lock(dmabuf->resv, NULL); + /* + * Lock ordering requires the mutex to be taken inside the reservation, + * make sure lockdep sees this. + */ + if (IS_ENABLED(CONFIG_LOCKDEP)) { + mutex_lock(&pages->mutex); + mutex_unlock(&pages->mutex); + } + + rc = sym_vfio_pci_dma_buf_iommufd_map(attach, &pages->dmabuf.phys); + if (rc) + goto err_detach; + + dma_resv_unlock(dmabuf->resv); + + /* On success iopt_release_pages() will detach and put the dmabuf. */ + pages->dmabuf.attach = attach; + return 0; + +err_detach: + dma_resv_unlock(dmabuf->resv); + dma_buf_detach(dmabuf, attach); + return rc; +} + +struct iopt_pages *iopt_alloc_dmabuf_pages(struct iommufd_ctx *ictx, + struct dma_buf *dmabuf, + unsigned long start_byte, + unsigned long start, + unsigned long length, bool writable) +{ + static struct lock_class_key pages_dmabuf_mutex_key; + struct iopt_pages *pages; + int rc; + + if (!IS_ENABLED(CONFIG_DMA_SHARED_BUFFER)) + return ERR_PTR(-EOPNOTSUPP); + + if (dmabuf->size <= (start + length - 1) || + length / PAGE_SIZE >= MAX_NPFNS) + return ERR_PTR(-EINVAL); + + pages = iopt_alloc_pages(start_byte, length, writable); + if (IS_ERR(pages)) + return pages; + + /* + * The mmap_lock can be held when obtaining the dmabuf reservation lock + * which creates a locking cycle with the pages mutex which is held + * while obtaining the mmap_lock. This locking path is not present for + * IOPT_ADDRESS_DMABUF so split the lock class. + */ + lockdep_set_class(&pages->mutex, &pages_dmabuf_mutex_key); + + /* dmabuf does not use pinned page accounting. */ + pages->account_mode = IOPT_PAGES_ACCOUNT_NONE; + pages->type = IOPT_ADDRESS_DMABUF; + pages->dmabuf.start = start - start_byte; + INIT_LIST_HEAD(&pages->dmabuf.tracker); + + rc = iopt_map_dmabuf(ictx, pages, dmabuf); + if (rc) { + iopt_put_pages(pages); + return ERR_PTR(rc); + } + + return pages; +} + +int iopt_dmabuf_track_domain(struct iopt_pages *pages, struct iopt_area *area, + struct iommu_domain *domain) +{ + struct iopt_pages_dmabuf_track *track; + + lockdep_assert_held(&pages->mutex); + if (WARN_ON(!iopt_is_dmabuf(pages))) + return -EINVAL; + + list_for_each_entry(track, &pages->dmabuf.tracker, elm) + if (WARN_ON(track->domain == domain && track->area == area)) + return -EINVAL; + + track = kzalloc(sizeof(*track), GFP_KERNEL); + if (!track) + return -ENOMEM; + track->domain = domain; + track->area = area; + list_add_tail(&track->elm, &pages->dmabuf.tracker); + + return 0; +} + +void iopt_dmabuf_untrack_domain(struct iopt_pages *pages, + struct iopt_area *area, + struct iommu_domain *domain) +{ + struct iopt_pages_dmabuf_track *track; + + lockdep_assert_held(&pages->mutex); + WARN_ON(!iopt_is_dmabuf(pages)); + + list_for_each_entry(track, &pages->dmabuf.tracker, elm) { + if (track->domain == domain && track->area == area) { + list_del(&track->elm); + kfree(track); + return; + } + } + WARN_ON(true); +} + +int iopt_dmabuf_track_all_domains(struct iopt_area *area, + struct iopt_pages *pages) +{ + struct iopt_pages_dmabuf_track *track; + struct iommu_domain *domain; + unsigned long index; + int rc; + + list_for_each_entry(track, &pages->dmabuf.tracker, elm) + if (WARN_ON(track->area == area)) + return -EINVAL; + + xa_for_each(&area->iopt->domains, index, domain) { + rc = iopt_dmabuf_track_domain(pages, area, domain); + if (rc) + goto err_untrack; + } + return 0; +err_untrack: + iopt_dmabuf_untrack_all_domains(area, pages); + return rc; +} + +void iopt_dmabuf_untrack_all_domains(struct iopt_area *area, + struct iopt_pages *pages) +{ + struct iopt_pages_dmabuf_track *track; + struct iopt_pages_dmabuf_track *tmp; + + list_for_each_entry_safe(track, tmp, &pages->dmabuf.tracker, + elm) { + if (track->area == area) { + list_del(&track->elm); + kfree(track); + } + } +} + void iopt_release_pages(struct kref *kref) { struct iopt_pages *pages = container_of(kref, struct iopt_pages, kref); @@ -1372,8 +1653,15 @@ void iopt_release_pages(struct kref *kref) mutex_destroy(&pages->mutex); put_task_struct(pages->source_task); free_uid(pages->source_user); - if (pages->type == IOPT_ADDRESS_FILE) + if (iopt_is_dmabuf(pages) && pages->dmabuf.attach) { + struct dma_buf *dmabuf = pages->dmabuf.attach->dmabuf; + + dma_buf_detach(dmabuf, pages->dmabuf.attach); + dma_buf_put(dmabuf); + WARN_ON(!list_empty(&pages->dmabuf.tracker)); + } else if (pages->type == IOPT_ADDRESS_FILE) { fput(pages->file); + } kfree(pages); } @@ -1451,6 +1739,14 @@ static void __iopt_area_unfill_domain(struct iopt_area *area, lockdep_assert_held(&pages->mutex); + if (iopt_is_dmabuf(pages)) { + if (WARN_ON(iopt_dmabuf_revoked(pages))) + return; + iopt_area_unmap_domain_range(area, domain, start_index, + last_index); + return; + } + /* * For security we must not unpin something that is still DMA mapped, * so this must unmap any IOVA before we go ahead and unpin the pages. @@ -1526,6 +1822,9 @@ void iopt_area_unmap_domain(struct iopt_area *area, struct iommu_domain *domain) void iopt_area_unfill_domain(struct iopt_area *area, struct iopt_pages *pages, struct iommu_domain *domain) { + if (iopt_dmabuf_revoked(pages)) + return; + __iopt_area_unfill_domain(area, pages, domain, iopt_area_last_index(area)); } @@ -1546,6 +1845,9 @@ int iopt_area_fill_domain(struct iopt_area *area, struct iommu_domain *domain) lockdep_assert_held(&area->pages->mutex); + if (iopt_dmabuf_revoked(area->pages)) + return 0; + rc = pfn_reader_first(&pfns, area->pages, iopt_area_index(area), iopt_area_last_index(area)); if (rc) @@ -1605,33 +1907,44 @@ int iopt_area_fill_domains(struct iopt_area *area, struct iopt_pages *pages) return 0; mutex_lock(&pages->mutex); - rc = pfn_reader_first(&pfns, pages, iopt_area_index(area), - iopt_area_last_index(area)); - if (rc) - goto out_unlock; + if (iopt_is_dmabuf(pages)) { + rc = iopt_dmabuf_track_all_domains(area, pages); + if (rc) + goto out_unlock; + } - while (!pfn_reader_done(&pfns)) { - done_first_end_index = pfns.batch_end_index; - done_all_end_index = pfns.batch_start_index; - xa_for_each(&area->iopt->domains, index, domain) { - rc = batch_to_domain(&pfns.batch, domain, area, - pfns.batch_start_index); + if (!iopt_dmabuf_revoked(pages)) { + rc = pfn_reader_first(&pfns, pages, iopt_area_index(area), + iopt_area_last_index(area)); + if (rc) + goto out_untrack; + + while (!pfn_reader_done(&pfns)) { + done_first_end_index = pfns.batch_end_index; + done_all_end_index = pfns.batch_start_index; + xa_for_each(&area->iopt->domains, index, domain) { + rc = batch_to_domain(&pfns.batch, domain, area, + pfns.batch_start_index); + if (rc) + goto out_unmap; + } + done_all_end_index = done_first_end_index; + + rc = pfn_reader_next(&pfns); if (rc) goto out_unmap; } - done_all_end_index = done_first_end_index; - - rc = pfn_reader_next(&pfns); + rc = pfn_reader_update_pinned(&pfns); if (rc) goto out_unmap; + + pfn_reader_destroy(&pfns); } - rc = pfn_reader_update_pinned(&pfns); - if (rc) - goto out_unmap; area->storage_domain = xa_load(&area->iopt->domains, 0); interval_tree_insert(&area->pages_node, &pages->domains_itree); - goto out_destroy; + mutex_unlock(&pages->mutex); + return 0; out_unmap: pfn_reader_release_pins(&pfns); @@ -1658,8 +1971,10 @@ int iopt_area_fill_domains(struct iopt_area *area, struct iopt_pages *pages) end_index); } } -out_destroy: pfn_reader_destroy(&pfns); +out_untrack: + if (iopt_is_dmabuf(pages)) + iopt_dmabuf_untrack_all_domains(area, pages); out_unlock: mutex_unlock(&pages->mutex); return rc; @@ -1685,16 +2000,22 @@ void iopt_area_unfill_domains(struct iopt_area *area, struct iopt_pages *pages) if (!area->storage_domain) goto out_unlock; - xa_for_each(&iopt->domains, index, domain) - if (domain != area->storage_domain) + xa_for_each(&iopt->domains, index, domain) { + if (domain == area->storage_domain) + continue; + + if (!iopt_dmabuf_revoked(pages)) iopt_area_unmap_domain_range( area, domain, iopt_area_index(area), iopt_area_last_index(area)); + } if (IS_ENABLED(CONFIG_IOMMUFD_TEST)) WARN_ON(RB_EMPTY_NODE(&area->pages_node.rb)); interval_tree_remove(&area->pages_node, &pages->domains_itree); iopt_area_unfill_domain(area, pages, area->storage_domain); + if (iopt_is_dmabuf(pages)) + iopt_dmabuf_untrack_all_domains(area, pages); area->storage_domain = NULL; out_unlock: mutex_unlock(&pages->mutex); @@ -2031,15 +2352,14 @@ int iopt_pages_rw_access(struct iopt_pages *pages, unsigned long start_byte, if ((flags & IOMMUFD_ACCESS_RW_WRITE) && !pages->writable) return -EPERM; - if (pages->type == IOPT_ADDRESS_FILE) + if (iopt_is_dmabuf(pages)) + return -EINVAL; + + if (pages->type != IOPT_ADDRESS_USER) return iopt_pages_rw_slow(pages, start_index, last_index, start_byte % PAGE_SIZE, data, length, flags); - if (IS_ENABLED(CONFIG_IOMMUFD_TEST) && - WARN_ON(pages->type != IOPT_ADDRESS_USER)) - return -EINVAL; - if (!(flags & IOMMUFD_ACCESS_RW_KTHREAD) && change_mm) { if (start_index == last_index) return iopt_pages_rw_page(pages, start_index, diff --git a/drivers/iommu/iommufd/selftest.c b/drivers/iommu/iommufd/selftest.c index 8fd27f65d6237..415d255ad7276 100644 --- a/drivers/iommu/iommufd/selftest.c +++ b/drivers/iommu/iommufd/selftest.c @@ -5,6 +5,8 @@ */ #include #include +#include +#include #include #include #include @@ -2037,6 +2039,140 @@ void iommufd_selftest_destroy(struct iommufd_object *obj) } } +struct iommufd_test_dma_buf { + void *memory; + size_t length; + bool revoked; +}; + +static int iommufd_test_dma_buf_attach(struct dma_buf *dmabuf, + struct dma_buf_attachment *attachment) +{ + return 0; +} + +static void iommufd_test_dma_buf_detach(struct dma_buf *dmabuf, + struct dma_buf_attachment *attachment) +{ +} + +static struct sg_table * +iommufd_test_dma_buf_map(struct dma_buf_attachment *attachment, + enum dma_data_direction dir) +{ + return ERR_PTR(-EOPNOTSUPP); +} + +static void iommufd_test_dma_buf_unmap(struct dma_buf_attachment *attachment, + struct sg_table *sgt, + enum dma_data_direction dir) +{ +} + +static void iommufd_test_dma_buf_release(struct dma_buf *dmabuf) +{ + struct iommufd_test_dma_buf *priv = dmabuf->priv; + + kfree(priv->memory); + kfree(priv); +} + +static const struct dma_buf_ops iommufd_test_dmabuf_ops = { + .attach = iommufd_test_dma_buf_attach, + .detach = iommufd_test_dma_buf_detach, + .map_dma_buf = iommufd_test_dma_buf_map, + .release = iommufd_test_dma_buf_release, + .unmap_dma_buf = iommufd_test_dma_buf_unmap, +}; + +int iommufd_test_dma_buf_iommufd_map(struct dma_buf_attachment *attachment, + struct dma_buf_phys_vec *phys) +{ + struct iommufd_test_dma_buf *priv = attachment->dmabuf->priv; + + dma_resv_assert_held(attachment->dmabuf->resv); + + if (attachment->dmabuf->ops != &iommufd_test_dmabuf_ops) + return -EOPNOTSUPP; + + if (priv->revoked) + return -ENODEV; + + phys->paddr = virt_to_phys(priv->memory); + phys->len = priv->length; + return 0; +} + +static int iommufd_test_dmabuf_get(struct iommufd_ucmd *ucmd, + unsigned int open_flags, + size_t len) +{ + DEFINE_DMA_BUF_EXPORT_INFO(exp_info); + struct iommufd_test_dma_buf *priv; + struct dma_buf *dmabuf; + int rc; + + len = ALIGN(len, PAGE_SIZE); + if (len == 0 || len > PAGE_SIZE * 512) + return -EINVAL; + + priv = kzalloc(sizeof(*priv), GFP_KERNEL); + if (!priv) + return -ENOMEM; + + priv->length = len; + priv->memory = kzalloc(len, GFP_KERNEL); + if (!priv->memory) { + rc = -ENOMEM; + goto err_free; + } + + exp_info.ops = &iommufd_test_dmabuf_ops; + exp_info.size = len; + exp_info.flags = open_flags; + exp_info.priv = priv; + + dmabuf = dma_buf_export(&exp_info); + if (IS_ERR(dmabuf)) { + rc = PTR_ERR(dmabuf); + goto err_free; + } + + return dma_buf_fd(dmabuf, open_flags); + +err_free: + kfree(priv->memory); + kfree(priv); + return rc; +} + +static int iommufd_test_dmabuf_revoke(struct iommufd_ucmd *ucmd, int fd, + bool revoked) +{ + struct iommufd_test_dma_buf *priv; + struct dma_buf *dmabuf; + int rc = 0; + + dmabuf = dma_buf_get(fd); + if (IS_ERR(dmabuf)) + return PTR_ERR(dmabuf); + + if (dmabuf->ops != &iommufd_test_dmabuf_ops) { + rc = -EOPNOTSUPP; + goto err_put; + } + + priv = dmabuf->priv; + dma_resv_lock(dmabuf->resv, NULL); + priv->revoked = revoked; + dma_buf_move_notify(dmabuf); + dma_resv_unlock(dmabuf->resv); + +err_put: + dma_buf_put(dmabuf); + return rc; +} + int iommufd_test(struct iommufd_ucmd *ucmd) { struct iommu_test_cmd *cmd = ucmd->cmd; @@ -2115,6 +2251,13 @@ int iommufd_test(struct iommufd_ucmd *ucmd) return iommufd_test_pasid_detach(ucmd, cmd); case IOMMU_TEST_OP_PASID_CHECK_HWPT: return iommufd_test_pasid_check_hwpt(ucmd, cmd); + case IOMMU_TEST_OP_DMABUF_GET: + return iommufd_test_dmabuf_get(ucmd, cmd->dmabuf_get.open_flags, + cmd->dmabuf_get.length); + case IOMMU_TEST_OP_DMABUF_REVOKE: + return iommufd_test_dmabuf_revoke(ucmd, + cmd->dmabuf_revoke.dmabuf_fd, + cmd->dmabuf_revoke.revoked); default: return -EOPNOTSUPP; } diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c index 49e395eb013d3..fd15973bf2683 100644 --- a/drivers/pci/p2pdma.c +++ b/drivers/pci/p2pdma.c @@ -25,12 +25,12 @@ struct pci_p2pdma { struct gen_pool *pool; bool p2pmem_published; struct xarray map_types; + struct p2pdma_provider mem[PCI_STD_NUM_BARS]; }; struct pci_p2pdma_pagemap { - struct pci_dev *provider; - u64 bus_offset; struct dev_pagemap pgmap; + struct p2pdma_provider *mem; }; static struct pci_p2pdma_pagemap *to_p2p_pgmap(struct dev_pagemap *pgmap) @@ -212,8 +212,8 @@ static void p2pdma_page_free(struct page *page) { struct pci_p2pdma_pagemap *pgmap = to_p2p_pgmap(page_pgmap(page)); /* safe to dereference while a reference is held to the percpu ref */ - struct pci_p2pdma *p2pdma = - rcu_dereference_protected(pgmap->provider->p2pdma, 1); + struct pci_p2pdma *p2pdma = rcu_dereference_protected( + to_pci_dev(pgmap->mem->owner)->p2pdma, 1); struct percpu_ref *ref; gen_pool_free_owner(p2pdma->pool, (uintptr_t)page_to_virt(page), @@ -236,56 +236,136 @@ static void pci_p2pdma_release(void *data) /* Flush and disable pci_alloc_p2p_mem() */ pdev->p2pdma = NULL; - synchronize_rcu(); + if (p2pdma->pool) + synchronize_rcu(); + xa_destroy(&p2pdma->map_types); + + if (!p2pdma->pool) + return; gen_pool_destroy(p2pdma->pool); sysfs_remove_group(&pdev->dev.kobj, &p2pmem_group); - xa_destroy(&p2pdma->map_types); } -static int pci_p2pdma_setup(struct pci_dev *pdev) +/** + * pcim_p2pdma_init - Initialise peer-to-peer DMA providers + * @pdev: The PCI device to enable P2PDMA for + * + * This function initializes the peer-to-peer DMA infrastructure + * for a PCI device. It allocates and sets up the necessary data + * structures to support P2PDMA operations, including mapping type + * tracking. + */ +int pcim_p2pdma_init(struct pci_dev *pdev) { - int error = -ENOMEM; struct pci_p2pdma *p2p; + int i, ret; + + p2p = rcu_dereference_protected(pdev->p2pdma, 1); + if (p2p) + return 0; p2p = devm_kzalloc(&pdev->dev, sizeof(*p2p), GFP_KERNEL); if (!p2p) return -ENOMEM; xa_init(&p2p->map_types); + /* + * Iterate over all standard PCI BARs and record only those that + * correspond to MMIO regions. Skip non-memory resources (e.g. I/O + * port BARs) since they cannot be used for peer-to-peer (P2P) + * transactions. + */ + for (i = 0; i < PCI_STD_NUM_BARS; i++) { + if (!(pci_resource_flags(pdev, i) & IORESOURCE_MEM)) + continue; - p2p->pool = gen_pool_create(PAGE_SHIFT, dev_to_node(&pdev->dev)); - if (!p2p->pool) - goto out; + p2p->mem[i].owner = &pdev->dev; + p2p->mem[i].bus_offset = + pci_bus_address(pdev, i) - pci_resource_start(pdev, i); + } - error = devm_add_action_or_reset(&pdev->dev, pci_p2pdma_release, pdev); - if (error) - goto out_pool_destroy; + ret = devm_add_action_or_reset(&pdev->dev, pci_p2pdma_release, pdev); + if (ret) + goto out_p2p; - error = sysfs_create_group(&pdev->dev.kobj, &p2pmem_group); - if (error) + rcu_assign_pointer(pdev->p2pdma, p2p); + return 0; + +out_p2p: + devm_kfree(&pdev->dev, p2p); + return ret; +} +EXPORT_SYMBOL_GPL(pcim_p2pdma_init); + +/** + * pcim_p2pdma_provider - Get peer-to-peer DMA provider + * @pdev: The PCI device to enable P2PDMA for + * @bar: BAR index to get provider + * + * This function gets peer-to-peer DMA provider for a PCI device. The lifetime + * of the provider (and of course the MMIO) is bound to the lifetime of the + * driver. A driver calling this function must ensure that all references to the + * provider, and any DMA mappings created for any MMIO, are all cleaned up + * before the driver remove() completes. + * + * Since P2P is almost always shared with a second driver this means some system + * to notify, invalidate and revoke the MMIO's DMA must be in place to use this + * function. For example a revoke can be built using DMABUF. + */ +struct p2pdma_provider *pcim_p2pdma_provider(struct pci_dev *pdev, int bar) +{ + struct pci_p2pdma *p2p; + + if (!(pci_resource_flags(pdev, bar) & IORESOURCE_MEM)) + return NULL; + + p2p = rcu_dereference_protected(pdev->p2pdma, 1); + if (WARN_ON(!p2p)) + /* Someone forgot to call to pcim_p2pdma_init() before */ + return NULL; + + return &p2p->mem[bar]; +} +EXPORT_SYMBOL_GPL(pcim_p2pdma_provider); + +static int pci_p2pdma_setup_pool(struct pci_dev *pdev) +{ + struct pci_p2pdma *p2pdma; + int ret; + + p2pdma = rcu_dereference_protected(pdev->p2pdma, 1); + if (p2pdma->pool) + /* We already setup pools, do nothing, */ + return 0; + + p2pdma->pool = gen_pool_create(PAGE_SHIFT, dev_to_node(&pdev->dev)); + if (!p2pdma->pool) + return -ENOMEM; + + ret = sysfs_create_group(&pdev->dev.kobj, &p2pmem_group); + if (ret) goto out_pool_destroy; - rcu_assign_pointer(pdev->p2pdma, p2p); return 0; out_pool_destroy: - gen_pool_destroy(p2p->pool); -out: - devm_kfree(&pdev->dev, p2p); - return error; + gen_pool_destroy(p2pdma->pool); + p2pdma->pool = NULL; + return ret; } static void pci_p2pdma_unmap_mappings(void *data) { - struct pci_dev *pdev = data; + struct pci_p2pdma_pagemap *p2p_pgmap = data; /* * Removing the alloc attribute from sysfs will call * unmap_mapping_range() on the inode, teardown any existing userspace * mappings and prevent new ones from being created. */ - sysfs_remove_file_from_group(&pdev->dev.kobj, &p2pmem_alloc_attr.attr, + sysfs_remove_file_from_group(&p2p_pgmap->mem->owner->kobj, + &p2pmem_alloc_attr.attr, p2pmem_group.name); } @@ -303,6 +383,7 @@ int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size, u64 offset) { struct pci_p2pdma_pagemap *p2p_pgmap; + struct p2pdma_provider *mem; struct dev_pagemap *pgmap; struct pci_p2pdma *p2pdma; void *addr; @@ -320,11 +401,21 @@ int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size, if (size + offset > pci_resource_len(pdev, bar)) return -EINVAL; - if (!pdev->p2pdma) { - error = pci_p2pdma_setup(pdev); - if (error) - return error; - } + error = pcim_p2pdma_init(pdev); + if (error) + return error; + + error = pci_p2pdma_setup_pool(pdev); + if (error) + return error; + + mem = pcim_p2pdma_provider(pdev, bar); + /* + * We checked validity of BAR prior to call + * to pcim_p2pdma_provider. It should never return NULL. + */ + if (WARN_ON(!mem)) + return -EINVAL; p2p_pgmap = devm_kzalloc(&pdev->dev, sizeof(*p2p_pgmap), GFP_KERNEL); if (!p2p_pgmap) @@ -336,10 +427,7 @@ int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size, pgmap->nr_range = 1; pgmap->type = MEMORY_DEVICE_PCI_P2PDMA; pgmap->ops = &p2pdma_pgmap_ops; - - p2p_pgmap->provider = pdev; - p2p_pgmap->bus_offset = pci_bus_address(pdev, bar) - - pci_resource_start(pdev, bar); + p2p_pgmap->mem = mem; addr = devm_memremap_pages(&pdev->dev, pgmap); if (IS_ERR(addr)) { @@ -348,7 +436,7 @@ int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size, } error = devm_add_action_or_reset(&pdev->dev, pci_p2pdma_unmap_mappings, - pdev); + p2p_pgmap); if (error) goto pages_free; @@ -980,16 +1068,26 @@ void pci_p2pmem_publish(struct pci_dev *pdev, bool publish) } EXPORT_SYMBOL_GPL(pci_p2pmem_publish); -static enum pci_p2pdma_map_type pci_p2pdma_map_type(struct dev_pagemap *pgmap, - struct device *dev) +/** + * pci_p2pdma_map_type - Determine the mapping type for P2PDMA transfers + * @provider: P2PDMA provider structure + * @dev: Target device for the transfer + * + * Determines how peer-to-peer DMA transfers should be mapped between + * the provider and the target device. The mapping type indicates whether + * the transfer can be done directly through PCI switches or must go + * through the host bridge. + */ +enum pci_p2pdma_map_type pci_p2pdma_map_type(struct p2pdma_provider *provider, + struct device *dev) { enum pci_p2pdma_map_type type = PCI_P2PDMA_MAP_NOT_SUPPORTED; - struct pci_dev *provider = to_p2p_pgmap(pgmap)->provider; + struct pci_dev *pdev = to_pci_dev(provider->owner); struct pci_dev *client; struct pci_p2pdma *p2pdma; int dist; - if (!provider->p2pdma) + if (!pdev->p2pdma) return PCI_P2PDMA_MAP_NOT_SUPPORTED; if (!dev_is_pci(dev)) @@ -998,7 +1096,7 @@ static enum pci_p2pdma_map_type pci_p2pdma_map_type(struct dev_pagemap *pgmap, client = to_pci_dev(dev); rcu_read_lock(); - p2pdma = rcu_dereference(provider->p2pdma); + p2pdma = rcu_dereference(pdev->p2pdma); if (p2pdma) type = xa_to_value(xa_load(&p2pdma->map_types, @@ -1006,7 +1104,7 @@ static enum pci_p2pdma_map_type pci_p2pdma_map_type(struct dev_pagemap *pgmap, rcu_read_unlock(); if (type == PCI_P2PDMA_MAP_UNKNOWN) - return calc_map_type_and_dist(provider, client, &dist, true); + return calc_map_type_and_dist(pdev, client, &dist, true); return type; } @@ -1014,9 +1112,13 @@ static enum pci_p2pdma_map_type pci_p2pdma_map_type(struct dev_pagemap *pgmap, void __pci_p2pdma_update_state(struct pci_p2pdma_map_state *state, struct device *dev, struct page *page) { - state->pgmap = page_pgmap(page); - state->map = pci_p2pdma_map_type(state->pgmap, dev); - state->bus_off = to_p2p_pgmap(state->pgmap)->bus_offset; + struct pci_p2pdma_pagemap *p2p_pgmap = to_p2p_pgmap(page_pgmap(page)); + + if (state->mem == p2p_pgmap->mem) + return; + + state->mem = p2p_pgmap->mem; + state->map = pci_p2pdma_map_type(p2p_pgmap->mem, dev); } /** diff --git a/drivers/resctrl/Kconfig b/drivers/resctrl/Kconfig new file mode 100644 index 0000000000000..672abea3b03cc --- /dev/null +++ b/drivers/resctrl/Kconfig @@ -0,0 +1,31 @@ +menuconfig ARM64_MPAM_DRIVER + bool "MPAM driver" + depends on ARM64 && ARM64_MPAM + select ACPI_MPAM if ACPI + help + Memory System Resource Partitioning and Monitoring (MPAM) driver for + System IP, e.g. caches and memory controllers. + +if ARM64_MPAM_DRIVER + +config ARM64_MPAM_DRIVER_DEBUG + bool "Enable debug messages from the MPAM driver" + help + Say yes here to enable debug messages from the MPAM driver. + +config MPAM_KUNIT_TEST + bool "KUnit tests for MPAM driver " if !KUNIT_ALL_TESTS + depends on KUNIT=y + default KUNIT_ALL_TESTS + help + Enable this option to run tests in the MPAM driver. + + If unsure, say N. + +endif + +config ARM64_MPAM_RESCTRL_FS + bool + default y if ARM64_MPAM_DRIVER && RESCTRL_FS + select RESCTRL_RMID_DEPENDS_ON_CLOSID + select RESCTRL_ASSIGN_FIXED diff --git a/drivers/resctrl/Makefile b/drivers/resctrl/Makefile new file mode 100644 index 0000000000000..4f6d0e81f9b8f --- /dev/null +++ b/drivers/resctrl/Makefile @@ -0,0 +1,5 @@ +obj-$(CONFIG_ARM64_MPAM_DRIVER) += mpam.o +mpam-y += mpam_devices.o +mpam-$(CONFIG_ARM64_MPAM_RESCTRL_FS) += mpam_resctrl.o + +ccflags-$(CONFIG_ARM64_MPAM_DRIVER_DEBUG) += -DDEBUG diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c new file mode 100644 index 0000000000000..f54882e4c039f --- /dev/null +++ b/drivers/resctrl/mpam_devices.c @@ -0,0 +1,2952 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (C) 2025 Arm Ltd. + +#define pr_fmt(fmt) "%s:%s: " fmt, KBUILD_MODNAME, __func__ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "mpam_internal.h" + +/* Values for the T241 errata workaround */ +#define T241_CHIPS_MAX 4 +#define T241_CHIP_NSLICES 12 +#define T241_SPARE_REG0_OFF 0x1b0000 +#define T241_SPARE_REG1_OFF 0x1c0000 +#define T241_CHIP_ID(phys) FIELD_GET(GENMASK_ULL(44, 43), phys) +#define T241_SHADOW_REG_OFF(sidx, pid) (0x360048 + (sidx) * 0x10000 + (pid) * 8) +#define SMCCC_SOC_ID_T241 0x036b0241 +static void __iomem *t241_scratch_regs[T241_CHIPS_MAX]; + +/* + * mpam_list_lock protects the SRCU lists when writing. Once the + * mpam_enabled key is enabled these lists are read-only, + * unless the error interrupt disables the driver. + */ +static DEFINE_MUTEX(mpam_list_lock); +static LIST_HEAD(mpam_all_msc); + +struct srcu_struct mpam_srcu; + +/* + * Number of MSCs that have been probed. Once all MSCs have been probed MPAM + * can be enabled. + */ +static atomic_t mpam_num_msc; + +static int mpam_cpuhp_state; +static DEFINE_MUTEX(mpam_cpuhp_state_lock); + +/* + * The smallest common values for any CPU or MSC in the system. + * Generating traffic outside this range will result in screaming interrupts. + */ +u16 mpam_partid_max; +u8 mpam_pmg_max; +static bool partid_max_init, partid_max_published; +static DEFINE_SPINLOCK(partid_max_lock); + +/* + * mpam is enabled once all devices have been probed from CPU online callbacks, + * scheduled via this work_struct. If access to an MSC depends on a CPU that + * was not brought online at boot, this can happen surprisingly late. + */ +static DECLARE_WORK(mpam_enable_work, &mpam_enable); + +/* + * All mpam error interrupts indicate a software bug. On receipt, disable the + * driver. + */ +static DECLARE_WORK(mpam_broken_work, &mpam_disable); + +/* When mpam is disabled, the printed reason to aid debugging */ +static char *mpam_disable_reason; + +/* + * Whether resctrl has been setup. Used by cpuhp in preference to + * mpam_is_enabled(). The disable call after an error interrupt makes + * mpam_is_enabled() false before the cpuhp callbacks are made. + * Reads/writes should hold mpam_cpuhp_state_lock, (or be cpuhp callbacks). + */ +static bool mpam_resctrl_enabled; + +/* + * An MSC is a physical container for controls and monitors, each identified by + * their RIS index. These share a base-address, interrupts and some MMIO + * registers. A vMSC is a virtual container for RIS in an MSC that control or + * monitor the same thing. Members of a vMSC are all RIS in the same MSC, but + * not all RIS in an MSC share a vMSC. + * + * Components are a group of vMSC that control or monitor the same thing but + * are from different MSC, so have different base-address, interrupts etc. + * Classes are the set components of the same type. + * + * The features of a vMSC is the union of the RIS it contains. + * The features of a Class and Component are the common subset of the vMSC + * they contain. + * + * e.g. The system cache may have bandwidth controls on multiple interfaces, + * for regulating traffic from devices independently of traffic from CPUs. + * If these are two RIS in one MSC, they will be treated as controlling + * different things, and will not share a vMSC/component/class. + * + * e.g. The L2 may have one MSC and two RIS, one for cache-controls another + * for bandwidth. These two RIS are members of the same vMSC. + * + * e.g. The set of RIS that make up the L2 are grouped as a component. These + * are sometimes termed slices. They should be configured the same, as if there + * were only one. + * + * e.g. The SoC probably has more than one L2, each attached to a distinct set + * of CPUs. All the L2 components are grouped as a class. + * + * When creating an MSC, struct mpam_msc is added to the all mpam_all_msc list, + * then linked via struct mpam_ris to a vmsc, component and class. + * The same MSC may exist under different class->component->vmsc paths, but the + * RIS index will be unique. + */ +LIST_HEAD(mpam_classes); + +/* List of all objects that can be free()d after synchronise_srcu() */ +static LLIST_HEAD(mpam_garbage); + +static inline void init_garbage(struct mpam_garbage *garbage) +{ + init_llist_node(&garbage->llist); +} + +#define add_to_garbage(x) \ +do { \ + __typeof__(x) _x = (x); \ + _x->garbage.to_free = _x; \ + llist_add(&_x->garbage.llist, &mpam_garbage); \ +} while (0) + +static void mpam_free_garbage(void) +{ + struct mpam_garbage *iter, *tmp; + struct llist_node *to_free = llist_del_all(&mpam_garbage); + + if (!to_free) + return; + + synchronize_srcu(&mpam_srcu); + + llist_for_each_entry_safe(iter, tmp, to_free, llist) { + if (iter->pdev) + devm_kfree(&iter->pdev->dev, iter->to_free); + else + kfree(iter->to_free); + } +} + +/* + * Once mpam is enabled, new requestors cannot further reduce the available + * partid. Assert that the size is fixed, and new requestors will be turned + * away. + */ +static void mpam_assert_partid_sizes_fixed(void) +{ + WARN_ON_ONCE(!partid_max_published); +} + +static u32 __mpam_read_reg(struct mpam_msc *msc, u16 reg) +{ + WARN_ON_ONCE(!cpumask_test_cpu(smp_processor_id(), &msc->accessibility)); + + return readl_relaxed(msc->mapped_hwpage + reg); +} + +static inline u32 _mpam_read_partsel_reg(struct mpam_msc *msc, u16 reg) +{ + lockdep_assert_held_once(&msc->part_sel_lock); + return __mpam_read_reg(msc, reg); +} + +#define mpam_read_partsel_reg(msc, reg) _mpam_read_partsel_reg(msc, MPAMF_##reg) + +static void __mpam_write_reg(struct mpam_msc *msc, u16 reg, u32 val) +{ + WARN_ON_ONCE(reg + sizeof(u32) > msc->mapped_hwpage_sz); + WARN_ON_ONCE(!cpumask_test_cpu(smp_processor_id(), &msc->accessibility)); + + writel_relaxed(val, msc->mapped_hwpage + reg); +} + +static inline void _mpam_write_partsel_reg(struct mpam_msc *msc, u16 reg, u32 val) +{ + lockdep_assert_held_once(&msc->part_sel_lock); + __mpam_write_reg(msc, reg, val); +} + +#define mpam_write_partsel_reg(msc, reg, val) _mpam_write_partsel_reg(msc, MPAMCFG_##reg, val) + +static inline u32 _mpam_read_monsel_reg(struct mpam_msc *msc, u16 reg) +{ + mpam_mon_sel_lock_held(msc); + return __mpam_read_reg(msc, reg); +} + +#define mpam_read_monsel_reg(msc, reg) _mpam_read_monsel_reg(msc, MSMON_##reg) + +static inline void _mpam_write_monsel_reg(struct mpam_msc *msc, u16 reg, u32 val) +{ + mpam_mon_sel_lock_held(msc); + __mpam_write_reg(msc, reg, val); +} + +#define mpam_write_monsel_reg(msc, reg, val) _mpam_write_monsel_reg(msc, MSMON_##reg, val) + +static u64 mpam_msc_read_idr(struct mpam_msc *msc) +{ + u64 idr_high = 0, idr_low; + + lockdep_assert_held(&msc->part_sel_lock); + + idr_low = mpam_read_partsel_reg(msc, IDR); + if (FIELD_GET(MPAMF_IDR_EXT, idr_low)) + idr_high = mpam_read_partsel_reg(msc, IDR + 4); + + return (idr_high << 32) | idr_low; +} + +static void mpam_msc_clear_esr(struct mpam_msc *msc) +{ + u64 esr_low = __mpam_read_reg(msc, MPAMF_ESR); + + if (!esr_low) + return; + + /* + * Clearing the high/low bits of MPAMF_ESR can not be atomic. + * Clear the top half first, so that the pending error bits in the + * lower half prevent hardware from updating either half of the + * register. + */ + if (msc->has_extd_esr) + __mpam_write_reg(msc, MPAMF_ESR + 4, 0); + __mpam_write_reg(msc, MPAMF_ESR, 0); +} + +static u64 mpam_msc_read_esr(struct mpam_msc *msc) +{ + u64 esr_high = 0, esr_low; + + esr_low = __mpam_read_reg(msc, MPAMF_ESR); + if (msc->has_extd_esr) + esr_high = __mpam_read_reg(msc, MPAMF_ESR + 4); + + return (esr_high << 32) | esr_low; +} + +static void __mpam_part_sel_raw(u32 partsel, struct mpam_msc *msc) +{ + lockdep_assert_held(&msc->part_sel_lock); + + mpam_write_partsel_reg(msc, PART_SEL, partsel); +} + +static void __mpam_part_sel(u8 ris_idx, u16 partid, struct mpam_msc *msc) +{ + u32 partsel = FIELD_PREP(MPAMCFG_PART_SEL_RIS, ris_idx) | + FIELD_PREP(MPAMCFG_PART_SEL_PARTID_SEL, partid); + + __mpam_part_sel_raw(partsel, msc); +} + +static void __mpam_intpart_sel(u8 ris_idx, u16 intpartid, struct mpam_msc *msc) +{ + u32 partsel = FIELD_PREP(MPAMCFG_PART_SEL_RIS, ris_idx) | + FIELD_PREP(MPAMCFG_PART_SEL_PARTID_SEL, intpartid) | + MPAMCFG_PART_SEL_INTERNAL; + + __mpam_part_sel_raw(partsel, msc); +} + +int mpam_register_requestor(u16 partid_max, u8 pmg_max) +{ + guard(spinlock)(&partid_max_lock); + if (!partid_max_init) { + mpam_partid_max = partid_max; + mpam_pmg_max = pmg_max; + partid_max_init = true; + } else if (!partid_max_published) { + mpam_partid_max = min(mpam_partid_max, partid_max); + mpam_pmg_max = min(mpam_pmg_max, pmg_max); + } else { + /* New requestors can't lower the values */ + if (partid_max < mpam_partid_max || pmg_max < mpam_pmg_max) + return -EBUSY; + } + + return 0; +} +EXPORT_SYMBOL(mpam_register_requestor); + +static struct mpam_class * +mpam_class_alloc(u8 level_idx, enum mpam_class_types type) +{ + struct mpam_class *class; + + lockdep_assert_held(&mpam_list_lock); + + class = kzalloc(sizeof(*class), GFP_KERNEL); + if (!class) + return ERR_PTR(-ENOMEM); + init_garbage(&class->garbage); + + INIT_LIST_HEAD_RCU(&class->components); + /* Affinity is updated when ris are added */ + class->level = level_idx; + class->type = type; + INIT_LIST_HEAD_RCU(&class->classes_list); + ida_init(&class->ida_csu_mon); + ida_init(&class->ida_mbwu_mon); + + list_add_rcu(&class->classes_list, &mpam_classes); + + return class; +} + +static void mpam_class_destroy(struct mpam_class *class) +{ + lockdep_assert_held(&mpam_list_lock); + + list_del_rcu(&class->classes_list); + add_to_garbage(class); +} + +static struct mpam_class * +mpam_class_find(u8 level_idx, enum mpam_class_types type) +{ + struct mpam_class *class; + + lockdep_assert_held(&mpam_list_lock); + + list_for_each_entry(class, &mpam_classes, classes_list) { + if (class->type == type && class->level == level_idx) + return class; + } + + return mpam_class_alloc(level_idx, type); +} + +static struct mpam_component * +mpam_component_alloc(struct mpam_class *class, int id) +{ + struct mpam_component *comp; + + lockdep_assert_held(&mpam_list_lock); + + comp = kzalloc(sizeof(*comp), GFP_KERNEL); + if (!comp) + return ERR_PTR(-ENOMEM); + init_garbage(&comp->garbage); + + comp->comp_id = id; + INIT_LIST_HEAD_RCU(&comp->vmsc); + /* Affinity is updated when RIS are added */ + INIT_LIST_HEAD_RCU(&comp->class_list); + comp->class = class; + + list_add_rcu(&comp->class_list, &class->components); + + return comp; +} + +static void __destroy_component_cfg(struct mpam_component *comp); + +static void mpam_component_destroy(struct mpam_component *comp) +{ + struct mpam_class *class = comp->class; + + lockdep_assert_held(&mpam_list_lock); + + __destroy_component_cfg(comp); + + list_del_rcu(&comp->class_list); + add_to_garbage(comp); + + if (list_empty(&class->components)) + mpam_class_destroy(class); +} + +static struct mpam_component * +mpam_component_find(struct mpam_class *class, int id) +{ + struct mpam_component *comp; + + lockdep_assert_held(&mpam_list_lock); + + list_for_each_entry(comp, &class->components, class_list) { + if (comp->comp_id == id) + return comp; + } + + return mpam_component_alloc(class, id); +} + +static struct mpam_vmsc * +mpam_vmsc_alloc(struct mpam_component *comp, struct mpam_msc *msc) +{ + struct mpam_vmsc *vmsc; + + lockdep_assert_held(&mpam_list_lock); + + vmsc = kzalloc(sizeof(*vmsc), GFP_KERNEL); + if (!vmsc) + return ERR_PTR(-ENOMEM); + init_garbage(&vmsc->garbage); + + INIT_LIST_HEAD_RCU(&vmsc->ris); + INIT_LIST_HEAD_RCU(&vmsc->comp_list); + vmsc->comp = comp; + vmsc->msc = msc; + + list_add_rcu(&vmsc->comp_list, &comp->vmsc); + + return vmsc; +} + +static void mpam_vmsc_destroy(struct mpam_vmsc *vmsc) +{ + struct mpam_component *comp = vmsc->comp; + + lockdep_assert_held(&mpam_list_lock); + + list_del_rcu(&vmsc->comp_list); + add_to_garbage(vmsc); + + if (list_empty(&comp->vmsc)) + mpam_component_destroy(comp); +} + +static struct mpam_vmsc * +mpam_vmsc_find(struct mpam_component *comp, struct mpam_msc *msc) +{ + struct mpam_vmsc *vmsc; + + lockdep_assert_held(&mpam_list_lock); + + list_for_each_entry(vmsc, &comp->vmsc, comp_list) { + if (vmsc->msc->id == msc->id) + return vmsc; + } + + return mpam_vmsc_alloc(comp, msc); +} + +/* + * The cacheinfo structures are only populated when CPUs are online. + * This helper walks the acpi tables to include offline CPUs too. + */ +int mpam_get_cpumask_from_cache_id(unsigned long cache_id, u32 cache_level, + cpumask_t *affinity) +{ + return acpi_pptt_get_cpumask_from_cache_id(cache_id, affinity); +} + +/* + * cpumask_of_node() only knows about online CPUs. This can't tell us whether + * a class is represented on all possible CPUs. + */ +static void get_cpumask_from_node_id(u32 node_id, cpumask_t *affinity) +{ + int cpu; + + for_each_possible_cpu(cpu) { + if (node_id == cpu_to_node(cpu)) + cpumask_set_cpu(cpu, affinity); + } +} + +static int mpam_ris_get_affinity(struct mpam_msc *msc, cpumask_t *affinity, + enum mpam_class_types type, + struct mpam_class *class, + struct mpam_component *comp) +{ + int err; + + switch (type) { + case MPAM_CLASS_CACHE: + err = mpam_get_cpumask_from_cache_id(comp->comp_id, class->level, + affinity); + if (err) { + dev_warn_once(&msc->pdev->dev, + "Failed to determine CPU affinity\n"); + return err; + } + + if (cpumask_empty(affinity)) + dev_warn_once(&msc->pdev->dev, "no CPUs associated with cache node\n"); + + break; + case MPAM_CLASS_MEMORY: + get_cpumask_from_node_id(comp->comp_id, affinity); + /* affinity may be empty for CPU-less memory nodes */ + break; + case MPAM_CLASS_UNKNOWN: + return 0; + } + + cpumask_and(affinity, affinity, &msc->accessibility); + + return 0; +} + +static int mpam_ris_create_locked(struct mpam_msc *msc, u8 ris_idx, + enum mpam_class_types type, u8 class_id, + int component_id) +{ + int err; + struct mpam_vmsc *vmsc; + struct mpam_msc_ris *ris; + struct mpam_class *class; + struct mpam_component *comp; + struct platform_device *pdev = msc->pdev; + + lockdep_assert_held(&mpam_list_lock); + + if (ris_idx > MPAM_MSC_MAX_NUM_RIS) + return -EINVAL; + + if (test_and_set_bit(ris_idx, &msc->ris_idxs)) + return -EBUSY; + + ris = devm_kzalloc(&msc->pdev->dev, sizeof(*ris), GFP_KERNEL); + if (!ris) + return -ENOMEM; + init_garbage(&ris->garbage); + ris->garbage.pdev = pdev; + + class = mpam_class_find(class_id, type); + if (IS_ERR(class)) + return PTR_ERR(class); + + comp = mpam_component_find(class, component_id); + if (IS_ERR(comp)) { + if (list_empty(&class->components)) + mpam_class_destroy(class); + return PTR_ERR(comp); + } + + vmsc = mpam_vmsc_find(comp, msc); + if (IS_ERR(vmsc)) { + if (list_empty(&comp->vmsc)) + mpam_component_destroy(comp); + return PTR_ERR(vmsc); + } + + err = mpam_ris_get_affinity(msc, &ris->affinity, type, class, comp); + if (err) { + if (list_empty(&vmsc->ris)) + mpam_vmsc_destroy(vmsc); + return err; + } + + ris->ris_idx = ris_idx; + INIT_LIST_HEAD_RCU(&ris->msc_list); + INIT_LIST_HEAD_RCU(&ris->vmsc_list); + ris->vmsc = vmsc; + + cpumask_or(&comp->affinity, &comp->affinity, &ris->affinity); + cpumask_or(&class->affinity, &class->affinity, &ris->affinity); + list_add_rcu(&ris->vmsc_list, &vmsc->ris); + list_add_rcu(&ris->msc_list, &msc->ris); + + return 0; +} + +static void mpam_ris_destroy(struct mpam_msc_ris *ris) +{ + struct mpam_vmsc *vmsc = ris->vmsc; + struct mpam_msc *msc = vmsc->msc; + struct mpam_component *comp = vmsc->comp; + struct mpam_class *class = comp->class; + + lockdep_assert_held(&mpam_list_lock); + + /* + * It is assumed affinities don't overlap. If they do the class becomes + * unusable immediately. + */ + cpumask_andnot(&class->affinity, &class->affinity, &ris->affinity); + cpumask_andnot(&comp->affinity, &comp->affinity, &ris->affinity); + clear_bit(ris->ris_idx, &msc->ris_idxs); + list_del_rcu(&ris->msc_list); + list_del_rcu(&ris->vmsc_list); + add_to_garbage(ris); + + if (list_empty(&vmsc->ris)) + mpam_vmsc_destroy(vmsc); +} + +int mpam_ris_create(struct mpam_msc *msc, u8 ris_idx, + enum mpam_class_types type, u8 class_id, int component_id) +{ + int err; + + mutex_lock(&mpam_list_lock); + err = mpam_ris_create_locked(msc, ris_idx, type, class_id, + component_id); + mutex_unlock(&mpam_list_lock); + if (err) + mpam_free_garbage(); + + return err; +} + +static struct mpam_msc_ris *mpam_get_or_create_ris(struct mpam_msc *msc, + u8 ris_idx) +{ + int err; + struct mpam_msc_ris *ris; + + lockdep_assert_held(&mpam_list_lock); + + if (!test_bit(ris_idx, &msc->ris_idxs)) { + err = mpam_ris_create_locked(msc, ris_idx, MPAM_CLASS_UNKNOWN, + 0, 0); + if (err) + return ERR_PTR(err); + } + + list_for_each_entry(ris, &msc->ris, msc_list) { + if (ris->ris_idx == ris_idx) + return ris; + } + + return ERR_PTR(-ENOENT); +} + +static int mpam_enable_quirk_nvidia_t241_1(struct mpam_msc *msc, + const struct mpam_quirk *quirk) +{ + s32 soc_id = arm_smccc_get_soc_id_version(); + struct resource *r; + phys_addr_t phys; + + /* + * A mapping to a device other than the MSC is needed, check + * SOC_ID is NVIDIA T241 chip (036b:0241) + */ + if (soc_id < 0 || soc_id != SMCCC_SOC_ID_T241) + return -EINVAL; + + r = platform_get_resource(msc->pdev, IORESOURCE_MEM, 0); + if (!r) + return -EINVAL; + + /* Find the internal registers base addr from the CHIP ID */ + msc->t241_id = T241_CHIP_ID(r->start); + phys = FIELD_PREP(GENMASK_ULL(45, 44), msc->t241_id) | 0x19000000ULL; + + t241_scratch_regs[msc->t241_id] = ioremap(phys, SZ_8M); + if (WARN_ON_ONCE(!t241_scratch_regs[msc->t241_id])) + return -EINVAL; + + pr_info_once("Enabled workaround for NVIDIA T241 erratum T241-MPAM-1\n"); + + return 0; +} + +static const struct mpam_quirk mpam_quirks[] = { + { + /* NVIDIA t241 erratum T241-MPAM-1 */ + .init = mpam_enable_quirk_nvidia_t241_1, + .iidr = MPAM_IIDR_NVIDIA_T241, + .iidr_mask = MPAM_IIDR_MATCH_ONE, + .workaround = T241_SCRUB_SHADOW_REGS, + }, + { + /* NVIDIA t241 erratum T241-MPAM-4 */ + .iidr = MPAM_IIDR_NVIDIA_T241, + .iidr_mask = MPAM_IIDR_MATCH_ONE, + .workaround = T241_FORCE_MBW_MIN_TO_ONE, + }, + { + /* NVIDIA t241 erratum T241-MPAM-6 */ + .iidr = MPAM_IIDR_NVIDIA_T241, + .iidr_mask = MPAM_IIDR_MATCH_ONE, + .workaround = T241_MBW_COUNTER_SCALE_64, + }, + { + /* ARM CMN-650 CSU erratum 3642720 */ + .iidr = MPAM_IIDR_ARM_CMN_650, + .iidr_mask = MPAM_IIDR_MATCH_ONE, + .workaround = IGNORE_CSU_NRDY, + }, + { NULL } /* Sentinel */ +}; + +static void mpam_enable_quirks(struct mpam_msc *msc) +{ + const struct mpam_quirk *quirk; + + for (quirk = &mpam_quirks[0]; quirk->iidr_mask; quirk++) { + int err = 0; + + if (quirk->iidr != (msc->iidr & quirk->iidr_mask)) + continue; + + if (quirk->init) + err = quirk->init(msc, quirk); + + if (err) + continue; + + mpam_set_quirk(quirk->workaround, msc); + } +} + +/* + * IHI009A.a has this nugget: "If a monitor does not support automatic behaviour + * of NRDY, software can use this bit for any purpose" - so hardware might not + * implement this - but it isn't RES0. + * + * Try and see what values stick in this bit. If we can write either value, + * its probably not implemented by hardware. + */ +static bool _mpam_ris_hw_probe_hw_nrdy(struct mpam_msc_ris *ris, u32 mon_reg) +{ + u32 now; + u64 mon_sel; + bool can_set, can_clear; + struct mpam_msc *msc = ris->vmsc->msc; + + if (WARN_ON_ONCE(!mpam_mon_sel_lock(msc))) + return false; + + mon_sel = FIELD_PREP(MSMON_CFG_MON_SEL_MON_SEL, 0) | + FIELD_PREP(MSMON_CFG_MON_SEL_RIS, ris->ris_idx); + _mpam_write_monsel_reg(msc, mon_reg, mon_sel); + + _mpam_write_monsel_reg(msc, mon_reg, MSMON___NRDY); + now = _mpam_read_monsel_reg(msc, mon_reg); + can_set = now & MSMON___NRDY; + + _mpam_write_monsel_reg(msc, mon_reg, 0); + now = _mpam_read_monsel_reg(msc, mon_reg); + can_clear = !(now & MSMON___NRDY); + mpam_mon_sel_unlock(msc); + + return (!can_set || !can_clear); +} + +#define mpam_ris_hw_probe_hw_nrdy(_ris, _mon_reg) \ + _mpam_ris_hw_probe_hw_nrdy(_ris, MSMON_##_mon_reg) + +static void mpam_ris_hw_probe(struct mpam_msc_ris *ris) +{ + int err; + struct mpam_msc *msc = ris->vmsc->msc; + struct device *dev = &msc->pdev->dev; + struct mpam_props *props = &ris->props; + struct mpam_class *class = ris->vmsc->comp->class; + + lockdep_assert_held(&msc->probe_lock); + lockdep_assert_held(&msc->part_sel_lock); + + /* Cache Capacity Partitioning */ + if (FIELD_GET(MPAMF_IDR_HAS_CCAP_PART, ris->idr)) { + u32 ccap_features = mpam_read_partsel_reg(msc, CCAP_IDR); + + props->cmax_wd = FIELD_GET(MPAMF_CCAP_IDR_CMAX_WD, ccap_features); + if (props->cmax_wd && + FIELD_GET(MPAMF_CCAP_IDR_HAS_CMAX_SOFTLIM, ccap_features)) + mpam_set_feature(mpam_feat_cmax_softlim, props); + + if (props->cmax_wd && + !FIELD_GET(MPAMF_CCAP_IDR_NO_CMAX, ccap_features)) + mpam_set_feature(mpam_feat_cmax_cmax, props); + + if (props->cmax_wd && + FIELD_GET(MPAMF_CCAP_IDR_HAS_CMIN, ccap_features)) + mpam_set_feature(mpam_feat_cmax_cmin, props); + + props->cassoc_wd = FIELD_GET(MPAMF_CCAP_IDR_CASSOC_WD, ccap_features); + if (props->cassoc_wd && + FIELD_GET(MPAMF_CCAP_IDR_HAS_CASSOC, ccap_features)) + mpam_set_feature(mpam_feat_cmax_cassoc, props); + } + + /* Cache Portion partitioning */ + if (FIELD_GET(MPAMF_IDR_HAS_CPOR_PART, ris->idr)) { + u32 cpor_features = mpam_read_partsel_reg(msc, CPOR_IDR); + + props->cpbm_wd = FIELD_GET(MPAMF_CPOR_IDR_CPBM_WD, cpor_features); + if (props->cpbm_wd) + mpam_set_feature(mpam_feat_cpor_part, props); + } + + /* Memory bandwidth partitioning */ + if (FIELD_GET(MPAMF_IDR_HAS_MBW_PART, ris->idr)) { + u32 mbw_features = mpam_read_partsel_reg(msc, MBW_IDR); + + /* portion bitmap resolution */ + props->mbw_pbm_bits = FIELD_GET(MPAMF_MBW_IDR_BWPBM_WD, mbw_features); + if (props->mbw_pbm_bits && + FIELD_GET(MPAMF_MBW_IDR_HAS_PBM, mbw_features)) + mpam_set_feature(mpam_feat_mbw_part, props); + + props->bwa_wd = FIELD_GET(MPAMF_MBW_IDR_BWA_WD, mbw_features); + + /* + * The BWA_WD field can represent 0-63, but the control fields it + * describes have a maximum of 16 bits. + */ + props->bwa_wd = min(props->bwa_wd, 16); + + if (props->bwa_wd && FIELD_GET(MPAMF_MBW_IDR_HAS_MAX, mbw_features)) + mpam_set_feature(mpam_feat_mbw_max, props); + + if (props->bwa_wd && FIELD_GET(MPAMF_MBW_IDR_HAS_MIN, mbw_features)) + mpam_set_feature(mpam_feat_mbw_min, props); + + if (props->bwa_wd && FIELD_GET(MPAMF_MBW_IDR_HAS_PROP, mbw_features)) + mpam_set_feature(mpam_feat_mbw_prop, props); + } + + /* Priority partitioning */ + if (FIELD_GET(MPAMF_IDR_HAS_PRI_PART, ris->idr)) { + u32 pri_features = mpam_read_partsel_reg(msc, PRI_IDR); + + props->intpri_wd = FIELD_GET(MPAMF_PRI_IDR_INTPRI_WD, pri_features); + if (props->intpri_wd && FIELD_GET(MPAMF_PRI_IDR_HAS_INTPRI, pri_features)) { + mpam_set_feature(mpam_feat_intpri_part, props); + if (FIELD_GET(MPAMF_PRI_IDR_INTPRI_0_IS_LOW, pri_features)) + mpam_set_feature(mpam_feat_intpri_part_0_low, props); + } + + props->dspri_wd = FIELD_GET(MPAMF_PRI_IDR_DSPRI_WD, pri_features); + if (props->dspri_wd && FIELD_GET(MPAMF_PRI_IDR_HAS_DSPRI, pri_features)) { + mpam_set_feature(mpam_feat_dspri_part, props); + if (FIELD_GET(MPAMF_PRI_IDR_DSPRI_0_IS_LOW, pri_features)) + mpam_set_feature(mpam_feat_dspri_part_0_low, props); + } + } + + /* Performance Monitoring */ + if (FIELD_GET(MPAMF_IDR_HAS_MSMON, ris->idr)) { + u32 msmon_features = mpam_read_partsel_reg(msc, MSMON_IDR); + + /* + * If the firmware max-nrdy-us property is missing, the + * CSU counters can't be used. Should we wait forever? + */ + err = device_property_read_u32(&msc->pdev->dev, + "arm,not-ready-us", + &msc->nrdy_usec); + + if (FIELD_GET(MPAMF_MSMON_IDR_MSMON_CSU, msmon_features)) { + u32 csumonidr; + + csumonidr = mpam_read_partsel_reg(msc, CSUMON_IDR); + props->num_csu_mon = FIELD_GET(MPAMF_CSUMON_IDR_NUM_MON, csumonidr); + if (props->num_csu_mon) { + bool hw_managed; + + mpam_set_feature(mpam_feat_msmon_csu, props); + + if (FIELD_GET(MPAMF_CSUMON_IDR_HAS_XCL, csumonidr)) + mpam_set_feature(mpam_feat_msmon_csu_xcl, props); + + /* Is NRDY hardware managed? */ + hw_managed = mpam_ris_hw_probe_hw_nrdy(ris, CSU); + if (hw_managed) + mpam_set_feature(mpam_feat_msmon_csu_hw_nrdy, props); + } + + /* + * Accept the missing firmware property if NRDY appears + * un-implemented. + */ + if (err && mpam_has_feature(mpam_feat_msmon_csu_hw_nrdy, props)) + dev_err_once(dev, "Counters are not usable because not-ready timeout was not provided by firmware."); + } + if (FIELD_GET(MPAMF_MSMON_IDR_MSMON_MBWU, msmon_features)) { + bool has_long, hw_managed; + u32 mbwumon_idr = mpam_read_partsel_reg(msc, MBWUMON_IDR); + + props->num_mbwu_mon = FIELD_GET(MPAMF_MBWUMON_IDR_NUM_MON, mbwumon_idr); + if (props->num_mbwu_mon) { + mpam_set_feature(mpam_feat_msmon_mbwu, props); + + if (FIELD_GET(MPAMF_MBWUMON_IDR_HAS_RWBW, mbwumon_idr)) + mpam_set_feature(mpam_feat_msmon_mbwu_rwbw, props); + + has_long = FIELD_GET(MPAMF_MBWUMON_IDR_HAS_LONG, mbwumon_idr); + if (has_long) { + if (FIELD_GET(MPAMF_MBWUMON_IDR_LWD, mbwumon_idr)) + mpam_set_feature(mpam_feat_msmon_mbwu_63counter, props); + else + mpam_set_feature(mpam_feat_msmon_mbwu_44counter, props); + } else { + mpam_set_feature(mpam_feat_msmon_mbwu_31counter, props); + } + + /* Is NRDY hardware managed? */ + hw_managed = mpam_ris_hw_probe_hw_nrdy(ris, MBWU); + if (hw_managed) + mpam_set_feature(mpam_feat_msmon_mbwu_hw_nrdy, props); + + /* + * Don't warn about any missing firmware property for + * MBWU NRDY - it doesn't make any sense! + */ + } + } + } + + /* + * RIS with PARTID narrowing don't have enough storage for one + * configuration per PARTID. If these are in a class we could use, + * reduce the supported partid_max to match the number of intpartid. + * If the class is unknown, just ignore it. + */ + if (FIELD_GET(MPAMF_IDR_HAS_PARTID_NRW, ris->idr) && + class->type != MPAM_CLASS_UNKNOWN) { + u32 nrwidr = mpam_read_partsel_reg(msc, PARTID_NRW_IDR); + u16 partid_max = FIELD_GET(MPAMF_PARTID_NRW_IDR_INTPARTID_MAX, nrwidr); + + mpam_set_feature(mpam_feat_partid_nrw, props); + msc->partid_max = min(msc->partid_max, partid_max); + } +} + +static int mpam_msc_hw_probe(struct mpam_msc *msc) +{ + u64 idr; + u16 partid_max; + u8 ris_idx, pmg_max; + struct mpam_msc_ris *ris; + struct device *dev = &msc->pdev->dev; + + lockdep_assert_held(&msc->probe_lock); + + idr = __mpam_read_reg(msc, MPAMF_AIDR); + if ((idr & MPAMF_AIDR_ARCH_MAJOR_REV) != MPAM_ARCHITECTURE_V1) { + dev_err_once(dev, "MSC does not match MPAM architecture v1.x\n"); + return -EIO; + } + + /* Grab an IDR value to find out how many RIS there are */ + mutex_lock(&msc->part_sel_lock); + idr = mpam_msc_read_idr(msc); + msc->iidr = mpam_read_partsel_reg(msc, IIDR); + mutex_unlock(&msc->part_sel_lock); + + mpam_enable_quirks(msc); + + msc->ris_max = FIELD_GET(MPAMF_IDR_RIS_MAX, idr); + + /* Use these values so partid/pmg always starts with a valid value */ + msc->partid_max = FIELD_GET(MPAMF_IDR_PARTID_MAX, idr); + msc->pmg_max = FIELD_GET(MPAMF_IDR_PMG_MAX, idr); + + for (ris_idx = 0; ris_idx <= msc->ris_max; ris_idx++) { + mutex_lock(&msc->part_sel_lock); + __mpam_part_sel(ris_idx, 0, msc); + idr = mpam_msc_read_idr(msc); + mutex_unlock(&msc->part_sel_lock); + + partid_max = FIELD_GET(MPAMF_IDR_PARTID_MAX, idr); + pmg_max = FIELD_GET(MPAMF_IDR_PMG_MAX, idr); + msc->partid_max = min(msc->partid_max, partid_max); + msc->pmg_max = min(msc->pmg_max, pmg_max); + msc->has_extd_esr = FIELD_GET(MPAMF_IDR_HAS_EXTD_ESR, idr); + + mutex_lock(&mpam_list_lock); + ris = mpam_get_or_create_ris(msc, ris_idx); + mutex_unlock(&mpam_list_lock); + if (IS_ERR(ris)) + return PTR_ERR(ris); + ris->idr = idr; + + mutex_lock(&msc->part_sel_lock); + __mpam_part_sel(ris_idx, 0, msc); + mpam_ris_hw_probe(ris); + mutex_unlock(&msc->part_sel_lock); + } + + /* Clear any stale errors */ + mpam_msc_clear_esr(msc); + + spin_lock(&partid_max_lock); + mpam_partid_max = min(mpam_partid_max, msc->partid_max); + mpam_pmg_max = min(mpam_pmg_max, msc->pmg_max); + spin_unlock(&partid_max_lock); + + msc->probed = true; + + return 0; +} + +struct mon_read { + struct mpam_msc_ris *ris; + struct mon_cfg *ctx; + enum mpam_device_features type; + u64 *val; + int err; + bool waited_timeout; +}; + +static bool mpam_ris_has_mbwu_long_counter(struct mpam_msc_ris *ris) +{ + return (mpam_has_feature(mpam_feat_msmon_mbwu_63counter, &ris->props) || + mpam_has_feature(mpam_feat_msmon_mbwu_44counter, &ris->props)); +} + +static u64 mpam_msc_read_mbwu_l(struct mpam_msc *msc) +{ + int retry = 3; + u32 mbwu_l_low; + u64 mbwu_l_high1, mbwu_l_high2; + + mpam_mon_sel_lock_held(msc); + + WARN_ON_ONCE((MSMON_MBWU_L + sizeof(u64)) > msc->mapped_hwpage_sz); + WARN_ON_ONCE(!cpumask_test_cpu(smp_processor_id(), &msc->accessibility)); + + mbwu_l_high2 = __mpam_read_reg(msc, MSMON_MBWU_L + 4); + do { + mbwu_l_high1 = mbwu_l_high2; + mbwu_l_low = __mpam_read_reg(msc, MSMON_MBWU_L); + mbwu_l_high2 = __mpam_read_reg(msc, MSMON_MBWU_L + 4); + + retry--; + } while (mbwu_l_high1 != mbwu_l_high2 && retry > 0); + + if (mbwu_l_high1 == mbwu_l_high2) + return (mbwu_l_high1 << 32) | mbwu_l_low; + + pr_warn("Failed to read a stable value\n"); + return MSMON___L_NRDY; +} + +static void mpam_msc_zero_mbwu_l(struct mpam_msc *msc) +{ + mpam_mon_sel_lock_held(msc); + + WARN_ON_ONCE((MSMON_MBWU_L + sizeof(u64)) > msc->mapped_hwpage_sz); + WARN_ON_ONCE(!cpumask_test_cpu(smp_processor_id(), &msc->accessibility)); + + __mpam_write_reg(msc, MSMON_MBWU_L, 0); + __mpam_write_reg(msc, MSMON_MBWU_L + 4, 0); +} + +static void gen_msmon_ctl_flt_vals(struct mon_read *m, u32 *ctl_val, + u32 *flt_val) +{ + struct mon_cfg *ctx = m->ctx; + + /* + * For CSU counters its implementation-defined what happens when not + * filtering by partid. + */ + *ctl_val = MSMON_CFG_x_CTL_MATCH_PARTID; + + *flt_val = FIELD_PREP(MSMON_CFG_x_FLT_PARTID, ctx->partid); + + if (m->ctx->match_pmg) { + *ctl_val |= MSMON_CFG_x_CTL_MATCH_PMG; + *flt_val |= FIELD_PREP(MSMON_CFG_x_FLT_PMG, ctx->pmg); + } + + switch (m->type) { + case mpam_feat_msmon_csu: + *ctl_val |= MSMON_CFG_CSU_CTL_TYPE_CSU; + + if (mpam_has_feature(mpam_feat_msmon_csu_xcl, &m->ris->props)) + *flt_val |= FIELD_PREP(MSMON_CFG_CSU_FLT_XCL, ctx->csu_exclude_clean); + + break; + case mpam_feat_msmon_mbwu_31counter: + case mpam_feat_msmon_mbwu_44counter: + case mpam_feat_msmon_mbwu_63counter: + *ctl_val |= MSMON_CFG_MBWU_CTL_TYPE_MBWU; + + if (mpam_has_feature(mpam_feat_msmon_mbwu_rwbw, &m->ris->props)) + *flt_val |= FIELD_PREP(MSMON_CFG_MBWU_FLT_RWBW, ctx->opts); + + break; + default: + pr_warn("Unexpected monitor type %d\n", m->type); + } +} + +static void read_msmon_ctl_flt_vals(struct mon_read *m, u32 *ctl_val, + u32 *flt_val) +{ + struct mpam_msc *msc = m->ris->vmsc->msc; + + switch (m->type) { + case mpam_feat_msmon_csu: + *ctl_val = mpam_read_monsel_reg(msc, CFG_CSU_CTL); + *flt_val = mpam_read_monsel_reg(msc, CFG_CSU_FLT); + break; + case mpam_feat_msmon_mbwu_31counter: + case mpam_feat_msmon_mbwu_44counter: + case mpam_feat_msmon_mbwu_63counter: + *ctl_val = mpam_read_monsel_reg(msc, CFG_MBWU_CTL); + *flt_val = mpam_read_monsel_reg(msc, CFG_MBWU_FLT); + break; + default: + pr_warn("Unexpected monitor type %d\n", m->type); + } +} + +/* Remove values set by the hardware to prevent apparent mismatches. */ +static inline void clean_msmon_ctl_val(u32 *cur_ctl) +{ + *cur_ctl &= ~MSMON_CFG_x_CTL_OFLOW_STATUS; + + if (FIELD_GET(MSMON_CFG_x_CTL_TYPE, *cur_ctl) == MSMON_CFG_MBWU_CTL_TYPE_MBWU) + *cur_ctl &= ~MSMON_CFG_MBWU_CTL_OFLOW_STATUS_L; +} + +static void write_msmon_ctl_flt_vals(struct mon_read *m, u32 ctl_val, + u32 flt_val) +{ + struct mpam_msc *msc = m->ris->vmsc->msc; + + /* + * Write the ctl_val with the enable bit cleared, reset the counter, + * then enable counter. + */ + switch (m->type) { + case mpam_feat_msmon_csu: + mpam_write_monsel_reg(msc, CFG_CSU_FLT, flt_val); + mpam_write_monsel_reg(msc, CFG_CSU_CTL, ctl_val); + mpam_write_monsel_reg(msc, CSU, 0); + mpam_write_monsel_reg(msc, CFG_CSU_CTL, ctl_val | MSMON_CFG_x_CTL_EN); + break; + case mpam_feat_msmon_mbwu_31counter: + case mpam_feat_msmon_mbwu_44counter: + case mpam_feat_msmon_mbwu_63counter: + mpam_write_monsel_reg(msc, CFG_MBWU_FLT, flt_val); + mpam_write_monsel_reg(msc, CFG_MBWU_CTL, ctl_val); + mpam_write_monsel_reg(msc, CFG_MBWU_CTL, ctl_val | MSMON_CFG_x_CTL_EN); + /* Counting monitors require NRDY to be reset by software */ + if (m->type == mpam_feat_msmon_mbwu_31counter) + mpam_write_monsel_reg(msc, MBWU, 0); + else + mpam_msc_zero_mbwu_l(m->ris->vmsc->msc); + break; + default: + pr_warn("Unexpected monitor type %d\n", m->type); + } +} + +static u64 __mpam_msmon_overflow_val(enum mpam_device_features type) +{ + /* TODO: implement scaling counters */ + switch (type) { + case mpam_feat_msmon_mbwu_63counter: + return BIT_ULL(hweight_long(MSMON___LWD_VALUE)); + case mpam_feat_msmon_mbwu_44counter: + return BIT_ULL(hweight_long(MSMON___L_VALUE)); + case mpam_feat_msmon_mbwu_31counter: + return BIT_ULL(hweight_long(MSMON___VALUE)); + default: + return 0; + } +} + +static u64 mpam_msmon_overflow_val(enum mpam_device_features type, + struct mpam_msc *msc) +{ + u64 overflow_val = __mpam_msmon_overflow_val(type); + + if (mpam_has_quirk(T241_MBW_COUNTER_SCALE_64, msc) && + type != mpam_feat_msmon_mbwu_63counter) + overflow_val *= 64; + + return overflow_val; +} + +static void __ris_msmon_read(void *arg) +{ + u64 now; + bool nrdy = false; + bool config_mismatch; + bool overflow; + struct mon_read *m = arg; + struct mon_cfg *ctx = m->ctx; + bool reset_on_next_read = false; + struct mpam_msc_ris *ris = m->ris; + struct msmon_mbwu_state *mbwu_state; + struct mpam_props *rprops = &ris->props; + struct mpam_msc *msc = m->ris->vmsc->msc; + u32 mon_sel, ctl_val, flt_val, cur_ctl, cur_flt; + + if (!mpam_mon_sel_lock(msc)) { + m->err = -EIO; + return; + } + mon_sel = FIELD_PREP(MSMON_CFG_MON_SEL_MON_SEL, ctx->mon) | + FIELD_PREP(MSMON_CFG_MON_SEL_RIS, ris->ris_idx); + mpam_write_monsel_reg(msc, CFG_MON_SEL, mon_sel); + + switch (m->type) { + case mpam_feat_msmon_mbwu_31counter: + case mpam_feat_msmon_mbwu_44counter: + case mpam_feat_msmon_mbwu_63counter: + mbwu_state = &ris->mbwu_state[ctx->mon]; + if (mbwu_state) { + reset_on_next_read = mbwu_state->reset_on_next_read; + mbwu_state->reset_on_next_read = false; + } + break; + default: + break; + } + + /* + * Read the existing configuration to avoid re-writing the same values. + * This saves waiting for 'nrdy' on subsequent reads. + */ + read_msmon_ctl_flt_vals(m, &cur_ctl, &cur_flt); + + if (mpam_feat_msmon_mbwu_31counter == m->type) + overflow = cur_ctl & MSMON_CFG_x_CTL_OFLOW_STATUS; + else if (mpam_feat_msmon_mbwu_44counter == m->type || + mpam_feat_msmon_mbwu_63counter == m->type) + overflow = cur_ctl & MSMON_CFG_MBWU_CTL_OFLOW_STATUS_L; + + clean_msmon_ctl_val(&cur_ctl); + gen_msmon_ctl_flt_vals(m, &ctl_val, &flt_val); + config_mismatch = cur_flt != flt_val || + cur_ctl != (ctl_val | MSMON_CFG_x_CTL_EN); + + if (config_mismatch || reset_on_next_read) { + write_msmon_ctl_flt_vals(m, ctl_val, flt_val); + overflow = false; + } else if (overflow) { + mpam_write_monsel_reg(msc, CFG_MBWU_CTL, + cur_ctl & + ~(MSMON_CFG_x_CTL_OFLOW_STATUS | + MSMON_CFG_MBWU_CTL_OFLOW_STATUS_L)); + } + + switch (m->type) { + case mpam_feat_msmon_csu: + now = mpam_read_monsel_reg(msc, CSU); + if (mpam_has_feature(mpam_feat_msmon_csu_hw_nrdy, rprops)) + nrdy = now & MSMON___NRDY; + now = FIELD_GET(MSMON___VALUE, now); + + if (mpam_has_quirk(IGNORE_CSU_NRDY, msc) && m->waited_timeout) + nrdy = false; + + break; + case mpam_feat_msmon_mbwu_31counter: + case mpam_feat_msmon_mbwu_44counter: + case mpam_feat_msmon_mbwu_63counter: + if (m->type != mpam_feat_msmon_mbwu_31counter) { + now = mpam_msc_read_mbwu_l(msc); + if (mpam_has_feature(mpam_feat_msmon_mbwu_hw_nrdy, rprops)) + nrdy = now & MSMON___L_NRDY; + + if (m->type == mpam_feat_msmon_mbwu_63counter) + now = FIELD_GET(MSMON___LWD_VALUE, now); + else + now = FIELD_GET(MSMON___L_VALUE, now); + } else { + now = mpam_read_monsel_reg(msc, MBWU); + if (mpam_has_feature(mpam_feat_msmon_mbwu_hw_nrdy, rprops)) + nrdy = now & MSMON___NRDY; + now = FIELD_GET(MSMON___VALUE, now); + } + + if (mpam_has_quirk(T241_MBW_COUNTER_SCALE_64, msc) && + m->type != mpam_feat_msmon_mbwu_63counter) + now *= 64; + + if (nrdy) + break; + + mbwu_state = &ris->mbwu_state[ctx->mon]; + + if (overflow) + mbwu_state->correction += mpam_msmon_overflow_val(m->type, msc); + + /* + * Include bandwidth consumed before the last hardware reset and + * a counter size increment for each overflow. + */ + now += mbwu_state->correction; + break; + default: + m->err = -EINVAL; + } + mpam_mon_sel_unlock(msc); + + if (nrdy) { + m->err = -EBUSY; + return; + } + + *m->val += now; +} + +static int _msmon_read(struct mpam_component *comp, struct mon_read *arg) +{ + int err, any_err = 0; + struct mpam_vmsc *vmsc; + + guard(srcu)(&mpam_srcu); + list_for_each_entry_srcu(vmsc, &comp->vmsc, comp_list, + srcu_read_lock_held(&mpam_srcu)) { + struct mpam_msc *msc = vmsc->msc; + struct mpam_msc_ris *ris; + + list_for_each_entry_srcu(ris, &vmsc->ris, vmsc_list, + srcu_read_lock_held(&mpam_srcu)) { + arg->ris = ris; + + err = smp_call_function_any(&msc->accessibility, + __ris_msmon_read, arg, + true); + if (!err && arg->err) + err = arg->err; + + /* + * Save one error to be returned to the caller, but + * keep reading counters so that get reprogrammed. On + * platforms with NRDY this lets us wait once. + */ + if (err) + any_err = err; + } + } + + return any_err; +} + +static enum mpam_device_features mpam_msmon_choose_counter(struct mpam_class *class) +{ + struct mpam_props *cprops = &class->props; + + if (mpam_has_feature(mpam_feat_msmon_mbwu_63counter, cprops)) + return mpam_feat_msmon_mbwu_63counter; + if (mpam_has_feature(mpam_feat_msmon_mbwu_44counter, cprops)) + return mpam_feat_msmon_mbwu_44counter; + + return mpam_feat_msmon_mbwu_31counter; +} + +int mpam_msmon_read(struct mpam_component *comp, struct mon_cfg *ctx, + enum mpam_device_features type, u64 *val) +{ + int err; + struct mon_read arg; + u64 wait_jiffies = 0; + struct mpam_class *class = comp->class; + struct mpam_props *cprops = &class->props; + + might_sleep(); + + if (!mpam_is_enabled()) + return -EIO; + + if (!mpam_has_feature(type, cprops)) + return -EOPNOTSUPP; + + if (type == mpam_feat_msmon_mbwu) + type = mpam_msmon_choose_counter(class); + + arg = (struct mon_read) { + .ctx = ctx, + .type = type, + .val = val, + }; + *val = 0; + + err = _msmon_read(comp, &arg); + if (err == -EBUSY && class->nrdy_usec) + wait_jiffies = usecs_to_jiffies(class->nrdy_usec); + + while (wait_jiffies) + wait_jiffies = schedule_timeout_uninterruptible(wait_jiffies); + + if (err == -EBUSY) { + arg = (struct mon_read) { + .ctx = ctx, + .type = type, + .val = val, + .waited_timeout = true, + }; + *val = 0; + + err = _msmon_read(comp, &arg); + } + + return err; +} + +void mpam_msmon_reset_mbwu(struct mpam_component *comp, struct mon_cfg *ctx) +{ + struct mpam_msc *msc; + struct mpam_vmsc *vmsc; + struct mpam_msc_ris *ris; + + if (!mpam_is_enabled()) + return; + + guard(srcu)(&mpam_srcu); + list_for_each_entry_srcu(vmsc, &comp->vmsc, comp_list, + srcu_read_lock_held(&mpam_srcu)) { + if (!mpam_has_feature(mpam_feat_msmon_mbwu, &vmsc->props)) + continue; + + msc = vmsc->msc; + list_for_each_entry_srcu(ris, &vmsc->ris, vmsc_list, + srcu_read_lock_held(&mpam_srcu)) { + if (!mpam_has_feature(mpam_feat_msmon_mbwu, &ris->props)) + continue; + + if (WARN_ON_ONCE(!mpam_mon_sel_lock(msc))) + continue; + + ris->mbwu_state[ctx->mon].correction = 0; + ris->mbwu_state[ctx->mon].reset_on_next_read = true; + mpam_mon_sel_unlock(msc); + } + } +} + +static void mpam_reset_msc_bitmap(struct mpam_msc *msc, u16 reg, u16 wd) +{ + u32 num_words, msb; + u32 bm = ~0; + int i; + + lockdep_assert_held(&msc->part_sel_lock); + + if (wd == 0) + return; + + /* + * Write all ~0 to all but the last 32bit-word, which may + * have fewer bits... + */ + num_words = DIV_ROUND_UP(wd, 32); + for (i = 0; i < num_words - 1; i++, reg += sizeof(bm)) + __mpam_write_reg(msc, reg, bm); + + /* + * ....and then the last (maybe) partial 32bit word. When wd is a + * multiple of 32, msb should be 31 to write a full 32bit word. + */ + msb = (wd - 1) % 32; + bm = GENMASK(msb, 0); + __mpam_write_reg(msc, reg, bm); +} + +static void mpam_apply_t241_erratum(struct mpam_msc_ris *ris, u16 partid) +{ + int sidx, i, lcount = 1000; + void __iomem *regs; + u64 val0, val; + + regs = t241_scratch_regs[ris->vmsc->msc->t241_id]; + + for (i = 0; i < lcount; i++) { + /* Read the shadow register at index 0 */ + val0 = readq_relaxed(regs + T241_SHADOW_REG_OFF(0, partid)); + + /* Check if all the shadow registers have the same value */ + for (sidx = 1; sidx < T241_CHIP_NSLICES; sidx++) { + val = readq_relaxed(regs + + T241_SHADOW_REG_OFF(sidx, partid)); + if (val != val0) + break; + } + if (sidx == T241_CHIP_NSLICES) + break; + } + + if (i == lcount) + pr_warn_once("t241: inconsistent values in shadow regs"); + + /* Write a value zero to spare registers to take effect of MBW conf */ + writeq_relaxed(0, regs + T241_SPARE_REG0_OFF); + writeq_relaxed(0, regs + T241_SPARE_REG1_OFF); +} + +static void mpam_quirk_post_config_change(struct mpam_msc_ris *ris, u16 partid, + struct mpam_config *cfg) +{ + if (mpam_has_quirk(T241_SCRUB_SHADOW_REGS, ris->vmsc->msc)) + mpam_apply_t241_erratum(ris, partid); +} + +static u16 mpam_wa_t241_force_mbw_min_to_one(struct mpam_props *props) +{ + u16 max_hw_value, min_hw_granule, res0_bits; + + res0_bits = 16 - props->bwa_wd; + max_hw_value = ((1 << props->bwa_wd) - 1) << res0_bits; + min_hw_granule = ~max_hw_value; + + return min_hw_granule + 1; +} + +static u16 mpam_wa_t241_calc_min_from_max(struct mpam_props *props, + struct mpam_config *cfg) +{ + u16 val = 0; + u16 max; + u16 delta = ((5 * MPAMCFG_MBW_MAX_MAX) / 100) - 1; + + if (mpam_has_feature(mpam_feat_mbw_max, cfg)) { + max = cfg->mbw_max; + } else { + /* Resetting. Hence, use the ris specific default. */ + max = GENMASK(15, 16 - props->bwa_wd); + } + + if (max > delta) + val = max - delta; + + return val; +} + +/* Called via IPI. Call while holding an SRCU reference */ +static void mpam_reprogram_ris_partid(struct mpam_msc_ris *ris, u16 partid, + struct mpam_config *cfg) +{ + u32 pri_val = 0; + u16 cmax = MPAMCFG_CMAX_CMAX; + struct mpam_msc *msc = ris->vmsc->msc; + struct mpam_props *rprops = &ris->props; + u16 dspri = GENMASK(rprops->dspri_wd, 0); + u16 intpri = GENMASK(rprops->intpri_wd, 0); + + mutex_lock(&msc->part_sel_lock); + __mpam_part_sel(ris->ris_idx, partid, msc); + + if (mpam_has_feature(mpam_feat_partid_nrw, rprops)) { + /* Update the intpartid mapping */ + mpam_write_partsel_reg(msc, INTPARTID, + MPAMCFG_INTPARTID_INTERNAL | partid); + + /* + * Then switch to the 'internal' partid to update the + * configuration. + */ + __mpam_intpart_sel(ris->ris_idx, partid, msc); + } + + if (mpam_has_feature(mpam_feat_cpor_part, rprops)) { + if (mpam_has_feature(mpam_feat_cpor_part, cfg)) + mpam_write_partsel_reg(msc, CPBM, cfg->cpbm); + else + mpam_reset_msc_bitmap(msc, MPAMCFG_CPBM, rprops->cpbm_wd); + } + + if (mpam_has_feature(mpam_feat_mbw_part, rprops)) { + if (mpam_has_feature(mpam_feat_mbw_part, cfg)) + mpam_reset_msc_bitmap(msc, MPAMCFG_MBW_PBM, rprops->mbw_pbm_bits); + else + mpam_write_partsel_reg(msc, MBW_PBM, cfg->mbw_pbm); + } + + if (mpam_has_feature(mpam_feat_mbw_min, rprops)) { + u16 val = 0; + + if (mpam_has_quirk(T241_FORCE_MBW_MIN_TO_ONE, msc)) { + u16 min = mpam_wa_t241_force_mbw_min_to_one(rprops); + + val = mpam_wa_t241_calc_min_from_max(rprops, cfg); + val = max(val, min); + } + + mpam_write_partsel_reg(msc, MBW_MIN, val); + } + + if (mpam_has_feature(mpam_feat_mbw_max, rprops)) { + if (mpam_has_feature(mpam_feat_mbw_max, cfg)) + mpam_write_partsel_reg(msc, MBW_MAX, cfg->mbw_max); + else + mpam_write_partsel_reg(msc, MBW_MAX, MPAMCFG_MBW_MAX_MAX); + } + + if (mpam_has_feature(mpam_feat_mbw_prop, rprops)) + mpam_write_partsel_reg(msc, MBW_PROP, 0); + + if (mpam_has_feature(mpam_feat_cmax_cmax, rprops)) + mpam_write_partsel_reg(msc, CMAX, cmax); + + if (mpam_has_feature(mpam_feat_cmax_cmin, rprops)) + mpam_write_partsel_reg(msc, CMIN, 0); + + if (mpam_has_feature(mpam_feat_cmax_cassoc, rprops)) + mpam_write_partsel_reg(msc, CASSOC, MPAMCFG_CASSOC_CASSOC); + + if (mpam_has_feature(mpam_feat_intpri_part, rprops) || + mpam_has_feature(mpam_feat_dspri_part, rprops)) { + /* aces high? */ + if (!mpam_has_feature(mpam_feat_intpri_part_0_low, rprops)) + intpri = 0; + if (!mpam_has_feature(mpam_feat_dspri_part_0_low, rprops)) + dspri = 0; + + if (mpam_has_feature(mpam_feat_intpri_part, rprops)) + pri_val |= FIELD_PREP(MPAMCFG_PRI_INTPRI, intpri); + if (mpam_has_feature(mpam_feat_dspri_part, rprops)) + pri_val |= FIELD_PREP(MPAMCFG_PRI_DSPRI, dspri); + + mpam_write_partsel_reg(msc, PRI, pri_val); + } + + mpam_quirk_post_config_change(ris, partid, cfg); + + mutex_unlock(&msc->part_sel_lock); +} + +/* Call with msc cfg_lock held */ +static int mpam_restore_mbwu_state(void *_ris) +{ + int i; + struct mon_read mwbu_arg; + struct mpam_msc_ris *ris = _ris; + struct mpam_class *class = ris->vmsc->comp->class; + + for (i = 0; i < ris->props.num_mbwu_mon; i++) { + if (ris->mbwu_state[i].enabled) { + mwbu_arg.ris = ris; + mwbu_arg.ctx = &ris->mbwu_state[i].cfg; + mwbu_arg.type = mpam_msmon_choose_counter(class); + + __ris_msmon_read(&mwbu_arg); + } + } + + return 0; +} + +/* Call with MSC cfg_lock held */ +static int mpam_save_mbwu_state(void *arg) +{ + int i; + u64 val; + struct mon_cfg *cfg; + u32 cur_flt, cur_ctl, mon_sel; + struct mpam_msc_ris *ris = arg; + struct msmon_mbwu_state *mbwu_state; + struct mpam_msc *msc = ris->vmsc->msc; + + for (i = 0; i < ris->props.num_mbwu_mon; i++) { + mbwu_state = &ris->mbwu_state[i]; + cfg = &mbwu_state->cfg; + + if (WARN_ON_ONCE(!mpam_mon_sel_lock(msc))) + return -EIO; + + mon_sel = FIELD_PREP(MSMON_CFG_MON_SEL_MON_SEL, i) | + FIELD_PREP(MSMON_CFG_MON_SEL_RIS, ris->ris_idx); + mpam_write_monsel_reg(msc, CFG_MON_SEL, mon_sel); + + cur_flt = mpam_read_monsel_reg(msc, CFG_MBWU_FLT); + cur_ctl = mpam_read_monsel_reg(msc, CFG_MBWU_CTL); + mpam_write_monsel_reg(msc, CFG_MBWU_CTL, 0); + + if (mpam_ris_has_mbwu_long_counter(ris)) { + val = mpam_msc_read_mbwu_l(msc); + mpam_msc_zero_mbwu_l(msc); + } else { + val = mpam_read_monsel_reg(msc, MBWU); + mpam_write_monsel_reg(msc, MBWU, 0); + } + + cfg->mon = i; + cfg->pmg = FIELD_GET(MSMON_CFG_x_FLT_PMG, cur_flt); + cfg->match_pmg = FIELD_GET(MSMON_CFG_x_CTL_MATCH_PMG, cur_ctl); + cfg->partid = FIELD_GET(MSMON_CFG_x_FLT_PARTID, cur_flt); + mbwu_state->correction += val; + mbwu_state->enabled = FIELD_GET(MSMON_CFG_x_CTL_EN, cur_ctl); + mpam_mon_sel_unlock(msc); + } + + return 0; +} + +/* + * Called via smp_call_on_cpu() to prevent migration, while still being + * pre-emptible. Caller must hold mpam_srcu. + */ +static int mpam_reset_ris(void *arg) +{ + u16 partid, partid_max; + struct mpam_config reset_cfg = {}; + struct mpam_msc_ris *ris = arg; + + if (ris->in_reset_state) + return 0; + + spin_lock(&partid_max_lock); + partid_max = mpam_partid_max; + spin_unlock(&partid_max_lock); + for (partid = 0; partid <= partid_max; partid++) + mpam_reprogram_ris_partid(ris, partid, &reset_cfg); + + return 0; +} + +/* + * Get the preferred CPU for this MSC. If it is accessible from this CPU, + * this CPU is preferred. This can be preempted/migrated, it will only result + * in more work. + */ +static int mpam_get_msc_preferred_cpu(struct mpam_msc *msc) +{ + int cpu = raw_smp_processor_id(); + + if (cpumask_test_cpu(cpu, &msc->accessibility)) + return cpu; + + return cpumask_first_and(&msc->accessibility, cpu_online_mask); +} + +static int mpam_touch_msc(struct mpam_msc *msc, int (*fn)(void *a), void *arg) +{ + lockdep_assert_irqs_enabled(); + lockdep_assert_cpus_held(); + WARN_ON_ONCE(!srcu_read_lock_held((&mpam_srcu))); + + return smp_call_on_cpu(mpam_get_msc_preferred_cpu(msc), fn, arg, true); +} + +struct mpam_write_config_arg { + struct mpam_msc_ris *ris; + struct mpam_component *comp; + u16 partid; +}; + +static int __write_config(void *arg) +{ + struct mpam_write_config_arg *c = arg; + + mpam_reprogram_ris_partid(c->ris, c->partid, &c->comp->cfg[c->partid]); + + return 0; +} + +static void mpam_reprogram_msc(struct mpam_msc *msc) +{ + u16 partid; + bool reset; + struct mpam_config *cfg; + struct mpam_msc_ris *ris; + struct mpam_write_config_arg arg; + + /* + * No lock for mpam_partid_max as partid_max_published has been + * set by mpam_enabled(), so the values can no longer change. + */ + mpam_assert_partid_sizes_fixed(); + + mutex_lock(&msc->cfg_lock); + list_for_each_entry_srcu(ris, &msc->ris, msc_list, + srcu_read_lock_held(&mpam_srcu)) { + if (!mpam_is_enabled() && !ris->in_reset_state) { + mpam_touch_msc(msc, &mpam_reset_ris, ris); + ris->in_reset_state = true; + continue; + } + + arg.comp = ris->vmsc->comp; + arg.ris = ris; + reset = true; + for (partid = 0; partid <= mpam_partid_max; partid++) { + cfg = &ris->vmsc->comp->cfg[partid]; + if (!bitmap_empty(cfg->features, MPAM_FEATURE_LAST)) + reset = false; + + arg.partid = partid; + mpam_touch_msc(msc, __write_config, &arg); + } + ris->in_reset_state = reset; + + if (mpam_has_feature(mpam_feat_msmon_mbwu, &ris->props)) + mpam_touch_msc(msc, &mpam_restore_mbwu_state, ris); + } + mutex_unlock(&msc->cfg_lock); +} + +static void _enable_percpu_irq(void *_irq) +{ + int *irq = _irq; + + enable_percpu_irq(*irq, IRQ_TYPE_NONE); +} + +static int mpam_cpu_online(unsigned int cpu) +{ + struct mpam_msc *msc; + + guard(srcu)(&mpam_srcu); + list_for_each_entry_srcu(msc, &mpam_all_msc, all_msc_list, + srcu_read_lock_held(&mpam_srcu)) { + if (!cpumask_test_cpu(cpu, &msc->accessibility)) + continue; + + if (msc->reenable_error_ppi) + _enable_percpu_irq(&msc->reenable_error_ppi); + + if (atomic_fetch_inc(&msc->online_refs) == 0) + mpam_reprogram_msc(msc); + } + + if (mpam_resctrl_enabled) + return mpam_resctrl_online_cpu(cpu); + + return 0; +} + +/* Before mpam is enabled, try to probe new MSC */ +static int mpam_discovery_cpu_online(unsigned int cpu) +{ + int err = 0; + struct mpam_msc *msc; + bool new_device_probed = false; + + if (mpam_is_enabled()) + return 0; + + guard(srcu)(&mpam_srcu); + list_for_each_entry_srcu(msc, &mpam_all_msc, all_msc_list, + srcu_read_lock_held(&mpam_srcu)) { + if (!cpumask_test_cpu(cpu, &msc->accessibility)) + continue; + + mutex_lock(&msc->probe_lock); + if (!msc->probed) + err = mpam_msc_hw_probe(msc); + mutex_unlock(&msc->probe_lock); + + if (err) + break; + new_device_probed = true; + } + + if (new_device_probed && !err) + schedule_work(&mpam_enable_work); + if (err) { + mpam_disable_reason = "error during probing"; + schedule_work(&mpam_broken_work); + } + + return err; +} + +static int mpam_cpu_offline(unsigned int cpu) +{ + struct mpam_msc *msc; + + if (mpam_resctrl_enabled) + mpam_resctrl_offline_cpu(cpu); + + guard(srcu)(&mpam_srcu); + list_for_each_entry_srcu(msc, &mpam_all_msc, all_msc_list, + srcu_read_lock_held(&mpam_srcu)) { + if (!cpumask_test_cpu(cpu, &msc->accessibility)) + continue; + + if (msc->reenable_error_ppi) + disable_percpu_irq(msc->reenable_error_ppi); + + if (atomic_dec_and_test(&msc->online_refs)) { + struct mpam_msc_ris *ris; + + mutex_lock(&msc->cfg_lock); + list_for_each_entry_srcu(ris, &msc->ris, msc_list, + srcu_read_lock_held(&mpam_srcu)) { + mpam_touch_msc(msc, &mpam_reset_ris, ris); + + /* + * The reset state for non-zero partid may be + * lost while the CPUs are offline. + */ + ris->in_reset_state = false; + + if (mpam_is_enabled()) + mpam_touch_msc(msc, &mpam_save_mbwu_state, ris); + } + mutex_unlock(&msc->cfg_lock); + } + } + + return 0; +} + +static void mpam_register_cpuhp_callbacks(int (*online)(unsigned int online), + int (*offline)(unsigned int offline), + char *name) +{ + mutex_lock(&mpam_cpuhp_state_lock); + if (mpam_cpuhp_state) { + cpuhp_remove_state(mpam_cpuhp_state); + mpam_cpuhp_state = 0; + } + + mpam_cpuhp_state = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, name, online, + offline); + if (mpam_cpuhp_state <= 0) { + pr_err("Failed to register cpuhp callbacks"); + mpam_cpuhp_state = 0; + } + mutex_unlock(&mpam_cpuhp_state_lock); +} + +static int __setup_ppi(struct mpam_msc *msc) +{ + int cpu; + + msc->error_dev_id = alloc_percpu(struct mpam_msc *); + if (!msc->error_dev_id) + return -ENOMEM; + + for_each_cpu(cpu, &msc->accessibility) + *per_cpu_ptr(msc->error_dev_id, cpu) = msc; + + return 0; +} + +static int mpam_msc_setup_error_irq(struct mpam_msc *msc) +{ + int irq; + + irq = platform_get_irq_byname_optional(msc->pdev, "error"); + if (irq <= 0) + return 0; + + /* Allocate and initialise the percpu device pointer for PPI */ + if (irq_is_percpu(irq)) + return __setup_ppi(msc); + + /* sanity check: shared interrupts can be routed anywhere? */ + if (!cpumask_equal(&msc->accessibility, cpu_possible_mask)) { + pr_err_once("msc:%u is a private resource with a shared error interrupt", + msc->id); + return -EINVAL; + } + + return 0; +} + +/* + * An MSC can control traffic from a set of CPUs, but may only be accessible + * from a (hopefully wider) set of CPUs. The common reason for this is power + * management. If all the CPUs in a cluster are in PSCI:CPU_SUSPEND, the + * corresponding cache may also be powered off. By making accesses from + * one of those CPUs, we ensure we don't access a cache that's powered off. + */ +static void update_msc_accessibility(struct mpam_msc *msc) +{ + u32 affinity_id; + int err; + + err = device_property_read_u32(&msc->pdev->dev, "cpu_affinity", + &affinity_id); + if (err) + cpumask_copy(&msc->accessibility, cpu_possible_mask); + else + acpi_pptt_get_cpus_from_container(affinity_id, &msc->accessibility); +} + +/* + * There are two ways of reaching a struct mpam_msc_ris. Via the + * class->component->vmsc->ris, or via the msc. + * When destroying the msc, the other side needs unlinking and cleaning up too. + */ +static void mpam_msc_destroy(struct mpam_msc *msc) +{ + struct platform_device *pdev = msc->pdev; + struct mpam_msc_ris *ris, *tmp; + + lockdep_assert_held(&mpam_list_lock); + + list_for_each_entry_safe(ris, tmp, &msc->ris, msc_list) + mpam_ris_destroy(ris); + + list_del_rcu(&msc->all_msc_list); + platform_set_drvdata(pdev, NULL); + + add_to_garbage(msc); +} + +static void mpam_msc_drv_remove(struct platform_device *pdev) +{ + struct mpam_msc *msc = platform_get_drvdata(pdev); + + mutex_lock(&mpam_list_lock); + mpam_msc_destroy(msc); + mutex_unlock(&mpam_list_lock); + + mpam_free_garbage(); +} + +static struct mpam_msc *do_mpam_msc_drv_probe(struct platform_device *pdev) +{ + int err; + u32 tmp; + struct mpam_msc *msc; + struct resource *msc_res; + struct device *dev = &pdev->dev; + + lockdep_assert_held(&mpam_list_lock); + + msc = devm_kzalloc(&pdev->dev, sizeof(*msc), GFP_KERNEL); + if (!msc) + return ERR_PTR(-ENOMEM); + init_garbage(&msc->garbage); + msc->garbage.pdev = pdev; + + err = devm_mutex_init(dev, &msc->probe_lock); + if (err) + return ERR_PTR(err); + + err = devm_mutex_init(dev, &msc->part_sel_lock); + if (err) + return ERR_PTR(err); + + err = devm_mutex_init(dev, &msc->error_irq_lock); + if (err) + return ERR_PTR(err); + + err = devm_mutex_init(dev, &msc->cfg_lock); + if (err) + return ERR_PTR(err); + + mpam_mon_sel_lock_init(msc); + msc->id = pdev->id; + msc->pdev = pdev; + INIT_LIST_HEAD_RCU(&msc->all_msc_list); + INIT_LIST_HEAD_RCU(&msc->ris); + + update_msc_accessibility(msc); + if (cpumask_empty(&msc->accessibility)) { + dev_err_once(dev, "MSC is not accessible from any CPU!"); + return ERR_PTR(-EINVAL); + } + + err = mpam_msc_setup_error_irq(msc); + if (err) + return ERR_PTR(err); + + if (device_property_read_u32(&pdev->dev, "pcc-channel", &tmp)) + msc->iface = MPAM_IFACE_MMIO; + else + msc->iface = MPAM_IFACE_PCC; + + if (msc->iface == MPAM_IFACE_MMIO) { + void __iomem *io; + + io = devm_platform_get_and_ioremap_resource(pdev, 0, + &msc_res); + if (IS_ERR(io)) { + dev_err_once(dev, "Failed to map MSC base address\n"); + return ERR_CAST(io); + } + msc->mapped_hwpage_sz = msc_res->end - msc_res->start; + msc->mapped_hwpage = io; + } else { + return ERR_PTR(-EINVAL); + } + + list_add_rcu(&msc->all_msc_list, &mpam_all_msc); + platform_set_drvdata(pdev, msc); + + return msc; +} + +static int fw_num_msc; + +static int mpam_msc_drv_probe(struct platform_device *pdev) +{ + int err; + struct mpam_msc *msc = NULL; + void *plat_data = pdev->dev.platform_data; + + mutex_lock(&mpam_list_lock); + msc = do_mpam_msc_drv_probe(pdev); + mutex_unlock(&mpam_list_lock); + + if (IS_ERR(msc)) + return PTR_ERR(msc); + + /* Create RIS entries described by firmware */ + err = acpi_mpam_parse_resources(msc, plat_data); + if (err) { + mpam_msc_drv_remove(pdev); + return err; + } + + if (atomic_add_return(1, &mpam_num_msc) == fw_num_msc) + mpam_register_cpuhp_callbacks(mpam_discovery_cpu_online, NULL, + "mpam:drv_probe"); + + return 0; +} + +static struct platform_driver mpam_msc_driver = { + .driver = { + .name = "mpam_msc", + }, + .probe = mpam_msc_drv_probe, + .remove = mpam_msc_drv_remove, +}; + +/* Any of these features mean the BWA_WD field is valid. */ +static bool mpam_has_bwa_wd_feature(struct mpam_props *props) +{ + if (mpam_has_feature(mpam_feat_mbw_min, props)) + return true; + if (mpam_has_feature(mpam_feat_mbw_max, props)) + return true; + if (mpam_has_feature(mpam_feat_mbw_prop, props)) + return true; + return false; +} + +/* Any of these features mean the CMAX_WD field is valid. */ +static bool mpam_has_cmax_wd_feature(struct mpam_props *props) +{ + if (mpam_has_feature(mpam_feat_cmax_cmax, props)) + return true; + if (mpam_has_feature(mpam_feat_cmax_cmin, props)) + return true; + return false; +} + +#define MISMATCHED_HELPER(parent, child, helper, field, alias) \ + helper(parent) && \ + ((helper(child) && (parent)->field != (child)->field) || \ + (!helper(child) && !(alias))) + +#define MISMATCHED_FEAT(parent, child, feat, field, alias) \ + mpam_has_feature((feat), (parent)) && \ + ((mpam_has_feature((feat), (child)) && (parent)->field != (child)->field) || \ + (!mpam_has_feature((feat), (child)) && !(alias))) + +#define CAN_MERGE_FEAT(parent, child, feat, alias) \ + (alias) && !mpam_has_feature((feat), (parent)) && \ + mpam_has_feature((feat), (child)) + +/* + * Combine two props fields. + * If this is for controls that alias the same resource, it is safe to just + * copy the values over. If two aliasing controls implement the same scheme + * a safe value must be picked. + * For non-aliasing controls, these control different resources, and the + * resulting safe value must be compatible with both. When merging values in + * the tree, all the aliasing resources must be handled first. + * On mismatch, parent is modified. + * Quirks on an MSC will apply to all MSC in that class. + */ +static void __props_mismatch(struct mpam_props *parent, + struct mpam_props *child, bool alias) +{ + if (CAN_MERGE_FEAT(parent, child, mpam_feat_cpor_part, alias)) { + parent->cpbm_wd = child->cpbm_wd; + } else if (MISMATCHED_FEAT(parent, child, mpam_feat_cpor_part, + cpbm_wd, alias)) { + pr_debug("cleared cpor_part\n"); + mpam_clear_feature(mpam_feat_cpor_part, parent); + parent->cpbm_wd = 0; + } + + if (CAN_MERGE_FEAT(parent, child, mpam_feat_mbw_part, alias)) { + parent->mbw_pbm_bits = child->mbw_pbm_bits; + } else if (MISMATCHED_FEAT(parent, child, mpam_feat_mbw_part, + mbw_pbm_bits, alias)) { + pr_debug("cleared mbw_part\n"); + mpam_clear_feature(mpam_feat_mbw_part, parent); + parent->mbw_pbm_bits = 0; + } + + /* bwa_wd is a count of bits, fewer bits means less precision */ + if (alias && !mpam_has_bwa_wd_feature(parent) && + mpam_has_bwa_wd_feature(child)) { + parent->bwa_wd = child->bwa_wd; + } else if (MISMATCHED_HELPER(parent, child, mpam_has_bwa_wd_feature, + bwa_wd, alias)) { + pr_debug("took the min bwa_wd\n"); + parent->bwa_wd = min(parent->bwa_wd, child->bwa_wd); + } + + if (alias && !mpam_has_cmax_wd_feature(parent) && mpam_has_cmax_wd_feature(child)) { + parent->cmax_wd = child->cmax_wd; + } else if (MISMATCHED_HELPER(parent, child, mpam_has_cmax_wd_feature, + cmax_wd, alias)) { + pr_debug("%s took the min cmax_wd\n", __func__); + parent->cmax_wd = min(parent->cmax_wd, child->cmax_wd); + } + + if (CAN_MERGE_FEAT(parent, child, mpam_feat_cmax_cassoc, alias)) { + parent->cassoc_wd = child->cassoc_wd; + } else if (MISMATCHED_FEAT(parent, child, mpam_feat_cmax_cassoc, + cassoc_wd, alias)) { + pr_debug("%s cleared cassoc_wd\n", __func__); + mpam_clear_feature(mpam_feat_cmax_cassoc, parent); + parent->cassoc_wd = 0; + } + + /* For num properties, take the minimum */ + if (CAN_MERGE_FEAT(parent, child, mpam_feat_msmon_csu, alias)) { + parent->num_csu_mon = child->num_csu_mon; + } else if (MISMATCHED_FEAT(parent, child, mpam_feat_msmon_csu, + num_csu_mon, alias)) { + pr_debug("took the min num_csu_mon\n"); + parent->num_csu_mon = min(parent->num_csu_mon, + child->num_csu_mon); + } + + if (CAN_MERGE_FEAT(parent, child, mpam_feat_msmon_mbwu, alias)) { + parent->num_mbwu_mon = child->num_mbwu_mon; + } else if (MISMATCHED_FEAT(parent, child, mpam_feat_msmon_mbwu, + num_mbwu_mon, alias)) { + pr_debug("took the min num_mbwu_mon\n"); + parent->num_mbwu_mon = min(parent->num_mbwu_mon, + child->num_mbwu_mon); + } + + if (CAN_MERGE_FEAT(parent, child, mpam_feat_intpri_part, alias)) { + parent->intpri_wd = child->intpri_wd; + } else if (MISMATCHED_FEAT(parent, child, mpam_feat_intpri_part, + intpri_wd, alias)) { + pr_debug("%s took the min intpri_wd\n", __func__); + parent->intpri_wd = min(parent->intpri_wd, child->intpri_wd); + } + + if (CAN_MERGE_FEAT(parent, child, mpam_feat_dspri_part, alias)) { + parent->dspri_wd = child->dspri_wd; + } else if (MISMATCHED_FEAT(parent, child, mpam_feat_dspri_part, + dspri_wd, alias)) { + pr_debug("%s took the min dspri_wd\n", __func__); + parent->dspri_wd = min(parent->dspri_wd, child->dspri_wd); + } + + /* TODO: alias support for these two */ + /* {int,ds}pri may not have differing 0-low behaviour */ + if (mpam_has_feature(mpam_feat_intpri_part, parent) && + (!mpam_has_feature(mpam_feat_intpri_part, child) || + mpam_has_feature(mpam_feat_intpri_part_0_low, parent) != + mpam_has_feature(mpam_feat_intpri_part_0_low, child))) { + pr_debug("%s cleared intpri_part\n", __func__); + mpam_clear_feature(mpam_feat_intpri_part, parent); + mpam_clear_feature(mpam_feat_intpri_part_0_low, parent); + } + if (mpam_has_feature(mpam_feat_dspri_part, parent) && + (!mpam_has_feature(mpam_feat_dspri_part, child) || + mpam_has_feature(mpam_feat_dspri_part_0_low, parent) != + mpam_has_feature(mpam_feat_dspri_part_0_low, child))) { + pr_debug("%s cleared dspri_part\n", __func__); + mpam_clear_feature(mpam_feat_dspri_part, parent); + mpam_clear_feature(mpam_feat_dspri_part_0_low, parent); + } + + if (alias) { + /* Merge features for aliased resources */ + bitmap_or(parent->features, parent->features, child->features, MPAM_FEATURE_LAST); + } else { + /* Clear missing features for non aliasing */ + bitmap_and(parent->features, parent->features, child->features, MPAM_FEATURE_LAST); + } +} + +/* + * If a vmsc doesn't match class feature/configuration, do the right thing(tm). + * For 'num' properties we can just take the minimum. + * For properties where the mismatched unused bits would make a difference, we + * nobble the class feature, as we can't configure all the resources. + * e.g. The L3 cache is composed of two resources with 13 and 17 portion + * bitmaps respectively. + * Quirks on an MSC will apply to all MSC in that class. + */ +static void +__class_props_mismatch(struct mpam_class *class, struct mpam_vmsc *vmsc) +{ + struct mpam_props *cprops = &class->props; + struct mpam_props *vprops = &vmsc->props; + struct device *dev = &vmsc->msc->pdev->dev; + + lockdep_assert_held(&mpam_list_lock); /* we modify class */ + + dev_dbg(dev, "Merging features for class:0x%lx &= vmsc:0x%lx\n", + (long)cprops->features, (long)vprops->features); + + /* Merge quirks */ + class->quirks |= vmsc->msc->quirks; + + /* Take the safe value for any common features */ + __props_mismatch(cprops, vprops, false); +} + +static void +__vmsc_props_mismatch(struct mpam_vmsc *vmsc, struct mpam_msc_ris *ris) +{ + struct mpam_props *rprops = &ris->props; + struct mpam_props *vprops = &vmsc->props; + struct device *dev = &vmsc->msc->pdev->dev; + + lockdep_assert_held(&mpam_list_lock); /* we modify vmsc */ + + dev_dbg(dev, "Merging features for vmsc:0x%lx |= ris:0x%lx\n", + (long)vprops->features, (long)rprops->features); + + /* + * Merge mismatched features - Copy any features that aren't common, + * but take the safe value for any common features. + */ + __props_mismatch(vprops, rprops, true); +} + +/* + * Copy the first component's first vMSC's properties and features to the + * class. __class_props_mismatch() will remove conflicts. + * It is not possible to have a class with no components, or a component with + * no resources. The vMSC properties have already been built. + */ +static void mpam_enable_init_class_features(struct mpam_class *class) +{ + struct mpam_vmsc *vmsc; + struct mpam_component *comp; + + comp = list_first_entry(&class->components, + struct mpam_component, class_list); + vmsc = list_first_entry(&comp->vmsc, + struct mpam_vmsc, comp_list); + + class->props = vmsc->props; +} + +static void mpam_enable_merge_vmsc_features(struct mpam_component *comp) +{ + struct mpam_vmsc *vmsc; + struct mpam_msc_ris *ris; + struct mpam_class *class = comp->class; + + list_for_each_entry(vmsc, &comp->vmsc, comp_list) { + list_for_each_entry(ris, &vmsc->ris, vmsc_list) { + __vmsc_props_mismatch(vmsc, ris); + class->nrdy_usec = max(class->nrdy_usec, + vmsc->msc->nrdy_usec); + } + } +} + +static void mpam_enable_merge_class_features(struct mpam_component *comp) +{ + struct mpam_vmsc *vmsc; + struct mpam_class *class = comp->class; + + list_for_each_entry(vmsc, &comp->vmsc, comp_list) + __class_props_mismatch(class, vmsc); + + if (mpam_has_quirk(T241_FORCE_MBW_MIN_TO_ONE, class)) + mpam_clear_feature(mpam_feat_mbw_min, &class->props); +} + +/* + * Merge all the common resource features into class. + * vmsc features are bitwise-or'd together by mpam_enable_merge_vmsc_features() + * as the first step so that mpam_enable_init_class_features() can initialise + * the class with a representative set of features. + * Next the mpam_enable_merge_class_features() bitwise-and's all the vmsc + * features to form the class features. + * Other features are the min/max as appropriate. + * + * To avoid walking the whole tree twice, the class->nrdy_usec property is + * updated when working with the vmsc as it is a max(), and doesn't need + * initialising first. + */ +static void mpam_enable_merge_features(struct list_head *all_classes_list) +{ + struct mpam_class *class; + struct mpam_component *comp; + + lockdep_assert_held(&mpam_list_lock); + + list_for_each_entry(class, all_classes_list, classes_list) { + list_for_each_entry(comp, &class->components, class_list) + mpam_enable_merge_vmsc_features(comp); + + mpam_enable_init_class_features(class); + + list_for_each_entry(comp, &class->components, class_list) + mpam_enable_merge_class_features(comp); + } +} + +static char *mpam_errcode_names[16] = { + [MPAM_ERRCODE_NONE] = "No error", + [MPAM_ERRCODE_PARTID_SEL_RANGE] = "PARTID_SEL_Range", + [MPAM_ERRCODE_REQ_PARTID_RANGE] = "Req_PARTID_Range", + [MPAM_ERRCODE_MSMONCFG_ID_RANGE] = "MSMONCFG_ID_RANGE", + [MPAM_ERRCODE_REQ_PMG_RANGE] = "Req_PMG_Range", + [MPAM_ERRCODE_MONITOR_RANGE] = "Monitor_Range", + [MPAM_ERRCODE_INTPARTID_RANGE] = "intPARTID_Range", + [MPAM_ERRCODE_UNEXPECTED_INTERNAL] = "Unexpected_INTERNAL", + [MPAM_ERRCODE_UNDEFINED_RIS_PART_SEL] = "Undefined_RIS_PART_SEL", + [MPAM_ERRCODE_RIS_NO_CONTROL] = "RIS_No_Control", + [MPAM_ERRCODE_UNDEFINED_RIS_MON_SEL] = "Undefined_RIS_MON_SEL", + [MPAM_ERRCODE_RIS_NO_MONITOR] = "RIS_No_Monitor", + [12 ... 15] = "Reserved" +}; + +static int mpam_enable_msc_ecr(void *_msc) +{ + struct mpam_msc *msc = _msc; + + __mpam_write_reg(msc, MPAMF_ECR, MPAMF_ECR_INTEN); + + return 0; +} + +/* This can run in mpam_disable(), and the interrupt handler on the same CPU */ +static int mpam_disable_msc_ecr(void *_msc) +{ + struct mpam_msc *msc = _msc; + + __mpam_write_reg(msc, MPAMF_ECR, 0); + + return 0; +} + +static irqreturn_t __mpam_irq_handler(int irq, struct mpam_msc *msc) +{ + u64 reg; + u16 partid; + u8 errcode, pmg, ris; + + if (WARN_ON_ONCE(!msc) || + WARN_ON_ONCE(!cpumask_test_cpu(smp_processor_id(), + &msc->accessibility))) + return IRQ_NONE; + + reg = mpam_msc_read_esr(msc); + + errcode = FIELD_GET(MPAMF_ESR_ERRCODE, reg); + if (!errcode) + return IRQ_NONE; + + /* Clear level triggered irq */ + mpam_msc_clear_esr(msc); + + partid = FIELD_GET(MPAMF_ESR_PARTID_MON, reg); + pmg = FIELD_GET(MPAMF_ESR_PMG, reg); + ris = FIELD_GET(MPAMF_ESR_RIS, reg); + + pr_err_ratelimited("error irq from msc:%u '%s', partid:%u, pmg: %u, ris: %u\n", + msc->id, mpam_errcode_names[errcode], partid, pmg, + ris); + + /* Disable this interrupt. */ + mpam_disable_msc_ecr(msc); + + /* Are we racing with the thread disabling MPAM? */ + if (!mpam_is_enabled()) + return IRQ_HANDLED; + + /* + * Schedule the teardown work. Don't use a threaded IRQ as we can't + * unregister the interrupt from the threaded part of the handler. + */ + mpam_disable_reason = "hardware error interrupt"; + schedule_work(&mpam_broken_work); + + return IRQ_HANDLED; +} + +static irqreturn_t mpam_ppi_handler(int irq, void *dev_id) +{ + struct mpam_msc *msc = *(struct mpam_msc **)dev_id; + + return __mpam_irq_handler(irq, msc); +} + +static irqreturn_t mpam_spi_handler(int irq, void *dev_id) +{ + struct mpam_msc *msc = dev_id; + + return __mpam_irq_handler(irq, msc); +} + +static int mpam_register_irqs(void) +{ + int err, irq; + struct mpam_msc *msc; + + lockdep_assert_cpus_held(); + + guard(srcu)(&mpam_srcu); + list_for_each_entry_srcu(msc, &mpam_all_msc, all_msc_list, + srcu_read_lock_held(&mpam_srcu)) { + irq = platform_get_irq_byname_optional(msc->pdev, "error"); + if (irq <= 0) + continue; + + /* The MPAM spec says the interrupt can be SPI, PPI or LPI */ + /* We anticipate sharing the interrupt with other MSCs */ + if (irq_is_percpu(irq)) { + err = request_percpu_irq(irq, &mpam_ppi_handler, + "mpam:msc:error", + msc->error_dev_id); + if (err) + return err; + + msc->reenable_error_ppi = irq; + smp_call_function_many(&msc->accessibility, + &_enable_percpu_irq, &irq, + true); + } else { + err = devm_request_irq(&msc->pdev->dev, irq, + &mpam_spi_handler, IRQF_SHARED, + "mpam:msc:error", msc); + if (err) + return err; + } + + mutex_lock(&msc->error_irq_lock); + msc->error_irq_req = true; + mpam_touch_msc(msc, mpam_enable_msc_ecr, msc); + msc->error_irq_hw_enabled = true; + mutex_unlock(&msc->error_irq_lock); + } + + return 0; +} + +static void mpam_unregister_irqs(void) +{ + int irq; + struct mpam_msc *msc; + + guard(cpus_read_lock)(); + guard(srcu)(&mpam_srcu); + list_for_each_entry_srcu(msc, &mpam_all_msc, all_msc_list, + srcu_read_lock_held(&mpam_srcu)) { + irq = platform_get_irq_byname_optional(msc->pdev, "error"); + if (irq <= 0) + continue; + + mutex_lock(&msc->error_irq_lock); + if (msc->error_irq_hw_enabled) { + mpam_touch_msc(msc, mpam_disable_msc_ecr, msc); + msc->error_irq_hw_enabled = false; + } + + if (msc->error_irq_req) { + if (irq_is_percpu(irq)) { + msc->reenable_error_ppi = 0; + free_percpu_irq(irq, msc->error_dev_id); + } else { + devm_free_irq(&msc->pdev->dev, irq, msc); + } + msc->error_irq_req = false; + } + mutex_unlock(&msc->error_irq_lock); + } +} + +static void __destroy_component_cfg(struct mpam_component *comp) +{ + struct mpam_msc *msc; + struct mpam_vmsc *vmsc; + struct mpam_msc_ris *ris; + + lockdep_assert_held(&mpam_list_lock); + + add_to_garbage(comp->cfg); + list_for_each_entry(vmsc, &comp->vmsc, comp_list) { + msc = vmsc->msc; + + if (mpam_mon_sel_lock(msc)) { + list_for_each_entry(ris, &vmsc->ris, vmsc_list) + add_to_garbage(ris->mbwu_state); + mpam_mon_sel_unlock(msc); + } + } +} + +static void mpam_reset_component_cfg(struct mpam_component *comp) +{ + int i; + struct mpam_props *cprops = &comp->class->props; + + mpam_assert_partid_sizes_fixed(); + + if (!comp->cfg) + return; + + for (i = 0; i <= mpam_partid_max; i++) { + comp->cfg[i] = (struct mpam_config) {}; + if (cprops->cpbm_wd) + comp->cfg[i].cpbm = GENMASK(cprops->cpbm_wd - 1, 0); + if (cprops->mbw_pbm_bits) + comp->cfg[i].mbw_pbm = GENMASK(cprops->mbw_pbm_bits - 1, 0); + if (cprops->bwa_wd) + comp->cfg[i].mbw_max = GENMASK(15, 16 - cprops->bwa_wd); + } +} + +static int __allocate_component_cfg(struct mpam_component *comp) +{ + struct mpam_vmsc *vmsc; + + mpam_assert_partid_sizes_fixed(); + + if (comp->cfg) + return 0; + + comp->cfg = kcalloc(mpam_partid_max + 1, sizeof(*comp->cfg), GFP_KERNEL); + if (!comp->cfg) + return -ENOMEM; + + /* + * The array is free()d in one go, so only cfg[0]'s structure needs + * to be initialised. + */ + init_garbage(&comp->cfg[0].garbage); + + mpam_reset_component_cfg(comp); + + list_for_each_entry(vmsc, &comp->vmsc, comp_list) { + struct mpam_msc *msc; + struct mpam_msc_ris *ris; + struct msmon_mbwu_state *mbwu_state; + + if (!vmsc->props.num_mbwu_mon) + continue; + + msc = vmsc->msc; + list_for_each_entry(ris, &vmsc->ris, vmsc_list) { + if (!ris->props.num_mbwu_mon) + continue; + + mbwu_state = kcalloc(ris->props.num_mbwu_mon, + sizeof(*ris->mbwu_state), + GFP_KERNEL); + if (!mbwu_state) { + __destroy_component_cfg(comp); + return -ENOMEM; + } + + init_garbage(&mbwu_state[0].garbage); + + if (mpam_mon_sel_lock(msc)) { + ris->mbwu_state = mbwu_state; + mpam_mon_sel_unlock(msc); + } + } + } + + return 0; +} + +static int mpam_allocate_config(void) +{ + struct mpam_class *class; + struct mpam_component *comp; + + lockdep_assert_held(&mpam_list_lock); + + list_for_each_entry(class, &mpam_classes, classes_list) { + list_for_each_entry(comp, &class->components, class_list) { + int err = __allocate_component_cfg(comp); + if (err) + return err; + } + } + + return 0; +} + +static void mpam_enable_once(void) +{ + int err; + + /* + * Once the cpuhp callbacks have been changed, mpam_partid_max can no + * longer change. + */ + spin_lock(&partid_max_lock); + partid_max_published = true; + spin_unlock(&partid_max_lock); + + /* + * If all the MSC have been probed, enabling the IRQs happens next. + * That involves cross-calling to a CPU that can reach the MSC, and + * the locks must be taken in this order: + */ + cpus_read_lock(); + mutex_lock(&mpam_list_lock); + do { + mpam_enable_merge_features(&mpam_classes); + + err = mpam_register_irqs(); + if (err) { + pr_warn("Failed to register irqs: %d\n", err); + break; + } + + err = mpam_allocate_config(); + if (err) { + pr_err("Failed to allocate configuration arrays.\n"); + break; + } + } while (0); + mutex_unlock(&mpam_list_lock); + cpus_read_unlock(); + + if (!err) { + err = mpam_resctrl_setup(); + if (err) + pr_err("Failed to initialise resctrl: %d\n", err); + } + + if (err) { + mpam_disable_reason = "Failed to enable."; + schedule_work(&mpam_broken_work); + return; + } + + static_branch_enable(&mpam_enabled); + mpam_resctrl_enabled = true; + mpam_register_cpuhp_callbacks(mpam_cpu_online, mpam_cpu_offline, + "mpam:online"); + + /* Use printk() to avoid the pr_fmt adding the function name. */ + printk(KERN_INFO "MPAM enabled with %u PARTIDs and %u PMGs\n", + mpam_partid_max + 1, mpam_pmg_max + 1); +} + +static void mpam_reset_component_locked(struct mpam_component *comp) +{ + struct mpam_vmsc *vmsc; + + lockdep_assert_cpus_held(); + mpam_assert_partid_sizes_fixed(); + + mpam_reset_component_cfg(comp); + + guard(srcu)(&mpam_srcu); + list_for_each_entry_srcu(vmsc, &comp->vmsc, comp_list, + srcu_read_lock_held(&mpam_srcu)) { + struct mpam_msc *msc = vmsc->msc; + struct mpam_msc_ris *ris; + + list_for_each_entry_srcu(ris, &vmsc->ris, vmsc_list, + srcu_read_lock_held(&mpam_srcu)) { + if (!ris->in_reset_state) + mpam_touch_msc(msc, mpam_reset_ris, ris); + ris->in_reset_state = true; + } + } +} + +void mpam_reset_class_locked(struct mpam_class *class) +{ + struct mpam_component *comp; + + lockdep_assert_cpus_held(); + + guard(srcu)(&mpam_srcu); + list_for_each_entry_srcu(comp, &class->components, class_list, + srcu_read_lock_held(&mpam_srcu)) + mpam_reset_component_locked(comp); +} + +static void mpam_reset_class(struct mpam_class *class) +{ + cpus_read_lock(); + mpam_reset_class_locked(class); + cpus_read_unlock(); +} + +/* + * Called in response to an error IRQ. + * All of MPAMs errors indicate a software bug, restore any modified + * controls to their reset values. + */ +void mpam_disable(struct work_struct *ignored) +{ + int idx; + bool do_resctrl_exit; + struct mpam_class *class; + struct mpam_msc *msc, *tmp; + + if (mpam_is_enabled()) + static_branch_disable(&mpam_enabled); + + mutex_lock(&mpam_cpuhp_state_lock); + if (mpam_cpuhp_state) { + cpuhp_remove_state(mpam_cpuhp_state); + mpam_cpuhp_state = 0; + } + + /* + * Removing the cpuhp state called mpam_cpu_offline() and told resctrl + * all the CPUs are offline. + */ + do_resctrl_exit = mpam_resctrl_enabled; + mpam_resctrl_enabled = false; + mutex_unlock(&mpam_cpuhp_state_lock); + + if (do_resctrl_exit) + mpam_resctrl_exit(); + + mpam_unregister_irqs(); + + idx = srcu_read_lock(&mpam_srcu); + list_for_each_entry_srcu(class, &mpam_classes, classes_list, + srcu_read_lock_held(&mpam_srcu)) { + mpam_reset_class(class); + if (do_resctrl_exit) + mpam_resctrl_teardown_class(class); + } + srcu_read_unlock(&mpam_srcu, idx); + + mutex_lock(&mpam_list_lock); + list_for_each_entry_safe(msc, tmp, &mpam_all_msc, all_msc_list) + mpam_msc_destroy(msc); + mutex_unlock(&mpam_list_lock); + mpam_free_garbage(); + + pr_err_once("MPAM disabled due to %s\n", mpam_disable_reason); +} + +/* + * Enable mpam once all devices have been probed. + * Scheduled by mpam_discovery_cpu_online() once all devices have been created. + * Also scheduled when new devices are probed when new CPUs come online. + */ +void mpam_enable(struct work_struct *work) +{ + static atomic_t once; + struct mpam_msc *msc; + bool all_devices_probed = true; + + /* Have we probed all the hw devices? */ + guard(srcu)(&mpam_srcu); + list_for_each_entry_srcu(msc, &mpam_all_msc, all_msc_list, + srcu_read_lock_held(&mpam_srcu)) { + mutex_lock(&msc->probe_lock); + if (!msc->probed) + all_devices_probed = false; + mutex_unlock(&msc->probe_lock); + + if (!all_devices_probed) + break; + } + + if (all_devices_probed && !atomic_fetch_inc(&once)) + mpam_enable_once(); +} + +#define maybe_update_config(cfg, feature, newcfg, member, changes) do { \ + if (mpam_has_feature(feature, newcfg) && \ + (newcfg)->member != (cfg)->member) { \ + (cfg)->member = (newcfg)->member; \ + mpam_set_feature(feature, cfg); \ + \ + (changes) = true; \ + } \ +} while (0) + +static bool mpam_update_config(struct mpam_config *cfg, + const struct mpam_config *newcfg) +{ + bool has_changes = false; + + maybe_update_config(cfg, mpam_feat_cpor_part, newcfg, cpbm, has_changes); + maybe_update_config(cfg, mpam_feat_mbw_part, newcfg, mbw_pbm, has_changes); + maybe_update_config(cfg, mpam_feat_mbw_max, newcfg, mbw_max, has_changes); + + return has_changes; +} + +int mpam_apply_config(struct mpam_component *comp, u16 partid, + struct mpam_config *cfg) +{ + struct mpam_write_config_arg arg; + struct mpam_msc_ris *ris; + struct mpam_vmsc *vmsc; + struct mpam_msc *msc; + + lockdep_assert_cpus_held(); + + /* Don't pass in the current config! */ + WARN_ON_ONCE(&comp->cfg[partid] == cfg); + + if (!mpam_update_config(&comp->cfg[partid], cfg)) + return 0; + + arg.comp = comp; + arg.partid = partid; + + guard(srcu)(&mpam_srcu); + list_for_each_entry_srcu(vmsc, &comp->vmsc, comp_list, + srcu_read_lock_held(&mpam_srcu)) { + msc = vmsc->msc; + + mutex_lock(&msc->cfg_lock); + list_for_each_entry_srcu(ris, &vmsc->ris, vmsc_list, + srcu_read_lock_held(&mpam_srcu)) { + arg.ris = ris; + mpam_touch_msc(msc, __write_config, &arg); + ris->in_reset_state = false; + } + mutex_unlock(&msc->cfg_lock); + } + + return 0; +} + +static int __init mpam_msc_driver_init(void) +{ + if (!system_supports_mpam()) + return -EOPNOTSUPP; + + init_srcu_struct(&mpam_srcu); + + fw_num_msc = acpi_mpam_count_msc(); + if (fw_num_msc <= 0) { + pr_err("No MSC devices found in firmware\n"); + return -EINVAL; + } + + return platform_driver_register(&mpam_msc_driver); +} + +/* Must occur after arm64_mpam_register_cpus() from arch_initcall() */ +subsys_initcall(mpam_msc_driver_init); + +#ifdef CONFIG_MPAM_KUNIT_TEST +#include "test_mpam_devices.c" +#endif diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h new file mode 100644 index 0000000000000..7b7dfa7b80a79 --- /dev/null +++ b/drivers/resctrl/mpam_internal.h @@ -0,0 +1,752 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +// Copyright (C) 2025 Arm Ltd. + +#ifndef MPAM_INTERNAL_H +#define MPAM_INTERNAL_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#define MPAM_MSC_MAX_NUM_RIS 16 + +struct platform_device; + +#ifdef CONFIG_MPAM_KUNIT_TEST +#define PACKED_FOR_KUNIT __packed +#else +#define PACKED_FOR_KUNIT +#endif + +/* + * This 'mon' values must not alias an actual monitor, so must be larger than + * U16_MAX, but not be confused with an errno value, so smaller than + * (u32)-SZ_4K. + * USE_PRE_ALLOCATED is used to avoid confusion with an actual monitor. + */ +#define USE_PRE_ALLOCATED (U16_MAX + 1) + +static inline bool mpam_is_enabled(void) +{ + return static_branch_likely(&mpam_enabled); +} + +/* + * Structures protected by SRCU may not be freed for a surprising amount of + * time (especially if perf is running). To ensure the MPAM error interrupt can + * tear down all the structures, build a list of objects that can be garbage + * collected once synchronize_srcu() has returned. + * If pdev is non-NULL, use devm_kfree(). + */ +struct mpam_garbage { + /* member of mpam_garbage */ + struct llist_node llist; + + void *to_free; + struct platform_device *pdev; +}; + +struct mpam_msc { + /* member of mpam_all_msc */ + struct list_head all_msc_list; + + int id; + struct platform_device *pdev; + + /* Not modified after mpam_is_enabled() becomes true */ + enum mpam_msc_iface iface; + u32 nrdy_usec; + cpumask_t accessibility; + bool has_extd_esr; + + int reenable_error_ppi; + struct mpam_msc * __percpu *error_dev_id; + + atomic_t online_refs; + + /* + * probe_lock is only taken during discovery. After discovery these + * properties become read-only and the lists are protected by SRCU. + */ + struct mutex probe_lock; + bool probed; + u16 partid_max; + u8 pmg_max; + unsigned long ris_idxs; + u32 ris_max; + u32 iidr; + u16 quirks; + + /* + * error_irq_lock is taken when registering/unregistering the error + * interrupt and maniupulating the below flags. + */ + struct mutex error_irq_lock; + bool error_irq_req; + bool error_irq_hw_enabled; + + /* mpam_msc_ris of this component */ + struct list_head ris; + + /* + * part_sel_lock protects access to the MSC hardware registers that are + * affected by MPAMCFG_PART_SEL. (including the ID registers that vary + * by RIS). + * If needed, take msc->probe_lock first. + */ + struct mutex part_sel_lock; + + /* + * cfg_lock protects the msc configuration and guards against mbwu_state + * save and restore racing. + */ + struct mutex cfg_lock; + + /* + * mon_sel_lock protects access to the MSC hardware registers that are + * affected by MPAMCFG_MON_SEL, and the mbwu_state. + * Access to mon_sel is needed from both process and interrupt contexts, + * but is complicated by firmware-backed platforms that can't make any + * access unless they can sleep. + * Always use the mpam_mon_sel_lock() helpers. + * Accesses to mon_sel need to be able to fail if they occur in the wrong + * context. + * If needed, take msc->probe_lock first. + */ + raw_spinlock_t _mon_sel_lock; + unsigned long _mon_sel_flags; + + void __iomem *mapped_hwpage; + size_t mapped_hwpage_sz; + + /* Values only used on some platforms for quirks */ + u32 t241_id; + + struct mpam_garbage garbage; +}; + +/* Returning false here means accesses to mon_sel must fail and report an error. */ +static inline bool __must_check mpam_mon_sel_lock(struct mpam_msc *msc) +{ + /* Locking will require updating to support a firmware backed interface */ + if (WARN_ON_ONCE(msc->iface != MPAM_IFACE_MMIO)) + return false; + + raw_spin_lock_irqsave(&msc->_mon_sel_lock, msc->_mon_sel_flags); + return true; +} + +static inline void mpam_mon_sel_unlock(struct mpam_msc *msc) +{ + raw_spin_unlock_irqrestore(&msc->_mon_sel_lock, msc->_mon_sel_flags); +} + +static inline void mpam_mon_sel_lock_held(struct mpam_msc *msc) +{ + lockdep_assert_held_once(&msc->_mon_sel_lock); +} + +static inline void mpam_mon_sel_lock_init(struct mpam_msc *msc) +{ + raw_spin_lock_init(&msc->_mon_sel_lock); +} + +/* Bits for mpam features bitmaps */ +enum mpam_device_features { + mpam_feat_cpor_part, + mpam_feat_cmax_softlim, + mpam_feat_cmax_cmax, + mpam_feat_cmax_cmin, + mpam_feat_cmax_cassoc, + mpam_feat_mbw_part, + mpam_feat_mbw_min, + mpam_feat_mbw_max, + mpam_feat_mbw_prop, + mpam_feat_intpri_part, + mpam_feat_intpri_part_0_low, + mpam_feat_dspri_part, + mpam_feat_dspri_part_0_low, + mpam_feat_msmon, + mpam_feat_msmon_csu, + mpam_feat_msmon_csu_capture, + mpam_feat_msmon_csu_xcl, + mpam_feat_msmon_csu_hw_nrdy, + mpam_feat_msmon_mbwu, + mpam_feat_msmon_mbwu_31counter, + mpam_feat_msmon_mbwu_44counter, + mpam_feat_msmon_mbwu_63counter, + mpam_feat_msmon_mbwu_capture, + mpam_feat_msmon_mbwu_rwbw, + mpam_feat_msmon_mbwu_hw_nrdy, + mpam_feat_partid_nrw, + MPAM_FEATURE_LAST +}; + +struct mpam_props { + DECLARE_BITMAP(features, MPAM_FEATURE_LAST); + + u16 cpbm_wd; + u16 mbw_pbm_bits; + u16 bwa_wd; + u16 cmax_wd; + u16 cassoc_wd; + u16 intpri_wd; + u16 dspri_wd; + u16 num_csu_mon; + u16 num_mbwu_mon; + +/* + * Kunit tests use memset() to set up feature combinations that should be + * removed, and will false-positive if the compiler introduces padding that + * isn't cleared during sanitisation. + */ +} PACKED_FOR_KUNIT; + +#define mpam_has_feature(_feat, x) test_bit(_feat, (x)->features) +#define mpam_set_feature(_feat, x) set_bit(_feat, (x)->features) +#define mpam_clear_feature(_feat, x) clear_bit(_feat, (x)->features) + +/* Workaround bits for msc->quirks */ +enum mpam_device_quirks { + T241_SCRUB_SHADOW_REGS, + T241_FORCE_MBW_MIN_TO_ONE, + T241_MBW_COUNTER_SCALE_64, + IGNORE_CSU_NRDY, + MPAM_QUIRK_LAST +}; + +#define mpam_has_quirk(_quirk, x) ((1 << (_quirk) & (x)->quirks)) +#define mpam_set_quirk(_quirk, x) ((x)->quirks |= (1 << (_quirk))) + +struct mpam_quirk { + int (*init)(struct mpam_msc *msc, const struct mpam_quirk *quirk); + + u32 iidr; + u32 iidr_mask; + + enum mpam_device_quirks workaround; +}; + +#define MPAM_IIDR_MATCH_ONE (FIELD_PREP_CONST(MPAMF_IIDR_PRODUCTID, 0xfff) | \ + FIELD_PREP_CONST(MPAMF_IIDR_VARIANT, 0xf) | \ + FIELD_PREP_CONST(MPAMF_IIDR_REVISION, 0xf) | \ + FIELD_PREP_CONST(MPAMF_IIDR_IMPLEMENTER, 0xfff)) + +#define MPAM_IIDR_NVIDIA_T241 (FIELD_PREP_CONST(MPAMF_IIDR_PRODUCTID, 0x241) | \ + FIELD_PREP_CONST(MPAMF_IIDR_VARIANT, 0) | \ + FIELD_PREP_CONST(MPAMF_IIDR_REVISION, 0) | \ + FIELD_PREP_CONST(MPAMF_IIDR_IMPLEMENTER, 0x36b)) + +#define MPAM_IIDR_ARM_CMN_650 (FIELD_PREP_CONST(MPAMF_IIDR_PRODUCTID, 0) | \ + FIELD_PREP_CONST(MPAMF_IIDR_VARIANT, 0) | \ + FIELD_PREP_CONST(MPAMF_IIDR_REVISION, 0) | \ + FIELD_PREP_CONST(MPAMF_IIDR_IMPLEMENTER, 0x43b)) + +/* The values for MSMON_CFG_MBWU_FLT.RWBW */ +enum mon_filter_options { + COUNT_BOTH = 0, + COUNT_WRITE = 1, + COUNT_READ = 2, +}; + +struct mon_cfg { + /* + * mon must be large enough to hold out of range values like + * USE_PRE_ALLOCATED + */ + u32 mon; + u8 pmg; + bool match_pmg; + bool csu_exclude_clean; + u32 partid; + enum mon_filter_options opts; +}; + +/* Changes to msmon_mbwu_state are protected by the msc's mon_sel_lock. */ +struct msmon_mbwu_state { + bool enabled; + bool reset_on_next_read; + struct mon_cfg cfg; + + /* + * The value to add to the new reading to account for power management, + * and overflow. + */ + u64 correction; + + struct mpam_garbage garbage; +}; + +struct mpam_class { + /* mpam_components in this class */ + struct list_head components; + + cpumask_t affinity; + + struct mpam_props props; + u32 nrdy_usec; + u16 quirks; + u8 level; + enum mpam_class_types type; + + /* member of mpam_classes */ + struct list_head classes_list; + + struct ida ida_csu_mon; + struct ida ida_mbwu_mon; + + struct mpam_garbage garbage; +}; + +struct mpam_config { + /* Which configuration values are valid. */ + DECLARE_BITMAP(features, MPAM_FEATURE_LAST); + + u32 cpbm; + u32 mbw_pbm; + u16 mbw_max; + + struct mpam_garbage garbage; +}; + +struct mpam_component { + u32 comp_id; + + /* mpam_vmsc in this component */ + struct list_head vmsc; + + cpumask_t affinity; + + /* + * Array of configuration values, indexed by partid. + * Read from cpuhp callbacks, hold the cpuhp lock when writing. + */ + struct mpam_config *cfg; + + /* member of mpam_class:components */ + struct list_head class_list; + + /* parent: */ + struct mpam_class *class; + + struct mpam_garbage garbage; +}; + +struct mpam_vmsc { + /* member of mpam_component:vmsc_list */ + struct list_head comp_list; + + /* mpam_msc_ris in this vmsc */ + struct list_head ris; + + struct mpam_props props; + + /* All RIS in this vMSC are members of this MSC */ + struct mpam_msc *msc; + + /* parent: */ + struct mpam_component *comp; + + struct mpam_garbage garbage; +}; + +struct mpam_msc_ris { + u8 ris_idx; + u64 idr; + struct mpam_props props; + bool in_reset_state; + + cpumask_t affinity; + + /* member of mpam_vmsc:ris */ + struct list_head vmsc_list; + + /* member of mpam_msc:ris */ + struct list_head msc_list; + + /* parent: */ + struct mpam_vmsc *vmsc; + + /* msmon mbwu configuration is preserved over reset */ + struct msmon_mbwu_state *mbwu_state; + + struct mpam_garbage garbage; +}; + +struct mpam_resctrl_dom { + struct mpam_component *ctrl_comp; + + /* + * There is no single mon_comp because different events may be backed + * by different class/components. mon_comp is indexed by the event + * number. + */ + struct mpam_component *mon_comp[QOS_NUM_EVENTS]; + + struct rdt_ctrl_domain resctrl_ctrl_dom; + struct rdt_l3_mon_domain resctrl_mon_dom; +}; + +struct mpam_resctrl_res { + struct mpam_class *class; + struct rdt_resource resctrl_res; + bool cdp_enabled; +}; + +struct mpam_resctrl_mon { + struct mpam_class *class; + + /* per-class data that resctrl needs will live here */ +}; + +static inline int mpam_alloc_csu_mon(struct mpam_class *class) +{ + struct mpam_props *cprops = &class->props; + + if (!mpam_has_feature(mpam_feat_msmon_csu, cprops)) + return -EOPNOTSUPP; + + return ida_alloc_max(&class->ida_csu_mon, cprops->num_csu_mon - 1, + GFP_KERNEL); +} + +static inline void mpam_free_csu_mon(struct mpam_class *class, int csu_mon) +{ + ida_free(&class->ida_csu_mon, csu_mon); +} + +static inline int mpam_alloc_mbwu_mon(struct mpam_class *class) +{ + struct mpam_props *cprops = &class->props; + + if (!mpam_has_feature(mpam_feat_msmon_mbwu, cprops)) + return -EOPNOTSUPP; + + return ida_alloc_max(&class->ida_mbwu_mon, cprops->num_mbwu_mon - 1, + GFP_KERNEL); +} + +static inline void mpam_free_mbwu_mon(struct mpam_class *class, int mbwu_mon) +{ + ida_free(&class->ida_mbwu_mon, mbwu_mon); +} + +/* List of all classes - protected by srcu*/ +extern struct srcu_struct mpam_srcu; +extern struct list_head mpam_classes; + +/* System wide partid/pmg values */ +extern u16 mpam_partid_max; +extern u8 mpam_pmg_max; + +/* Scheduled work callback to enable mpam once all MSC have been probed */ +void mpam_enable(struct work_struct *work); +void mpam_disable(struct work_struct *work); + +/* Reset all the RIS in a class under cpus_read_lock() */ +void mpam_reset_class_locked(struct mpam_class *class); + +int mpam_apply_config(struct mpam_component *comp, u16 partid, + struct mpam_config *cfg); + +int mpam_msmon_read(struct mpam_component *comp, struct mon_cfg *ctx, + enum mpam_device_features, u64 *val); +void mpam_msmon_reset_mbwu(struct mpam_component *comp, struct mon_cfg *ctx); + +int mpam_get_cpumask_from_cache_id(unsigned long cache_id, u32 cache_level, + cpumask_t *affinity); + +#ifdef CONFIG_RESCTRL_FS +int mpam_resctrl_setup(void); +void mpam_resctrl_exit(void); +int mpam_resctrl_online_cpu(unsigned int cpu); +void mpam_resctrl_offline_cpu(unsigned int cpu); +void mpam_resctrl_teardown_class(struct mpam_class *class); +#else +static inline int mpam_resctrl_setup(void) { return 0; } +static inline void mpam_resctrl_exit(void) { } +static inline int mpam_resctrl_online_cpu(unsigned int cpu) { return 0; } +static inline void mpam_resctrl_offline_cpu(unsigned int cpu) { } +static inline void mpam_resctrl_teardown_class(struct mpam_class *class) { } +#endif /* CONFIG_RESCTRL_FS */ + +/* + * MPAM MSCs have the following register layout. See: + * Arm Memory System Resource Partitioning and Monitoring (MPAM) System + * Component Specification. + * https://developer.arm.com/documentation/ihi0099/aa/ + */ +#define MPAM_ARCHITECTURE_V1 0x10 + +/* Memory mapped control pages */ +/* ID Register offsets in the memory mapped page */ +#define MPAMF_IDR 0x0000 /* features id register */ +#define MPAMF_IIDR 0x0018 /* implementer id register */ +#define MPAMF_AIDR 0x0020 /* architectural id register */ +#define MPAMF_IMPL_IDR 0x0028 /* imp-def partitioning */ +#define MPAMF_CPOR_IDR 0x0030 /* cache-portion partitioning */ +#define MPAMF_CCAP_IDR 0x0038 /* cache-capacity partitioning */ +#define MPAMF_MBW_IDR 0x0040 /* mem-bw partitioning */ +#define MPAMF_PRI_IDR 0x0048 /* priority partitioning */ +#define MPAMF_MSMON_IDR 0x0080 /* performance monitoring features */ +#define MPAMF_CSUMON_IDR 0x0088 /* cache-usage monitor */ +#define MPAMF_MBWUMON_IDR 0x0090 /* mem-bw usage monitor */ +#define MPAMF_PARTID_NRW_IDR 0x0050 /* partid-narrowing */ + +/* Configuration and Status Register offsets in the memory mapped page */ +#define MPAMCFG_PART_SEL 0x0100 /* partid to configure */ +#define MPAMCFG_CPBM 0x1000 /* cache-portion config */ +#define MPAMCFG_CMAX 0x0108 /* cache-capacity config */ +#define MPAMCFG_CMIN 0x0110 /* cache-capacity config */ +#define MPAMCFG_CASSOC 0x0118 /* cache-associativity config */ +#define MPAMCFG_MBW_MIN 0x0200 /* min mem-bw config */ +#define MPAMCFG_MBW_MAX 0x0208 /* max mem-bw config */ +#define MPAMCFG_MBW_WINWD 0x0220 /* mem-bw accounting window config */ +#define MPAMCFG_MBW_PBM 0x2000 /* mem-bw portion bitmap config */ +#define MPAMCFG_PRI 0x0400 /* priority partitioning config */ +#define MPAMCFG_MBW_PROP 0x0500 /* mem-bw stride config */ +#define MPAMCFG_INTPARTID 0x0600 /* partid-narrowing config */ + +#define MSMON_CFG_MON_SEL 0x0800 /* monitor selector */ +#define MSMON_CFG_CSU_FLT 0x0810 /* cache-usage monitor filter */ +#define MSMON_CFG_CSU_CTL 0x0818 /* cache-usage monitor config */ +#define MSMON_CFG_MBWU_FLT 0x0820 /* mem-bw monitor filter */ +#define MSMON_CFG_MBWU_CTL 0x0828 /* mem-bw monitor config */ +#define MSMON_CSU 0x0840 /* current cache-usage */ +#define MSMON_CSU_CAPTURE 0x0848 /* last cache-usage value captured */ +#define MSMON_MBWU 0x0860 /* current mem-bw usage value */ +#define MSMON_MBWU_CAPTURE 0x0868 /* last mem-bw value captured */ +#define MSMON_MBWU_L 0x0880 /* current long mem-bw usage value */ +#define MSMON_MBWU_L_CAPTURE 0x0890 /* last long mem-bw value captured */ +#define MSMON_CAPT_EVNT 0x0808 /* signal a capture event */ +#define MPAMF_ESR 0x00F8 /* error status register */ +#define MPAMF_ECR 0x00F0 /* error control register */ + +/* MPAMF_IDR - MPAM features ID register */ +#define MPAMF_IDR_PARTID_MAX GENMASK(15, 0) +#define MPAMF_IDR_PMG_MAX GENMASK(23, 16) +#define MPAMF_IDR_HAS_CCAP_PART BIT(24) +#define MPAMF_IDR_HAS_CPOR_PART BIT(25) +#define MPAMF_IDR_HAS_MBW_PART BIT(26) +#define MPAMF_IDR_HAS_PRI_PART BIT(27) +#define MPAMF_IDR_EXT BIT(28) +#define MPAMF_IDR_HAS_IMPL_IDR BIT(29) +#define MPAMF_IDR_HAS_MSMON BIT(30) +#define MPAMF_IDR_HAS_PARTID_NRW BIT(31) +#define MPAMF_IDR_HAS_RIS BIT(32) +#define MPAMF_IDR_HAS_EXTD_ESR BIT(38) +#define MPAMF_IDR_HAS_ESR BIT(39) +#define MPAMF_IDR_RIS_MAX GENMASK(59, 56) + +/* MPAMF_MSMON_IDR - MPAM performance monitoring ID register */ +#define MPAMF_MSMON_IDR_MSMON_CSU BIT(16) +#define MPAMF_MSMON_IDR_MSMON_MBWU BIT(17) +#define MPAMF_MSMON_IDR_HAS_LOCAL_CAPT_EVNT BIT(31) + +/* MPAMF_CPOR_IDR - MPAM features cache portion partitioning ID register */ +#define MPAMF_CPOR_IDR_CPBM_WD GENMASK(15, 0) + +/* MPAMF_CCAP_IDR - MPAM features cache capacity partitioning ID register */ +#define MPAMF_CCAP_IDR_CMAX_WD GENMASK(5, 0) +#define MPAMF_CCAP_IDR_CASSOC_WD GENMASK(12, 8) +#define MPAMF_CCAP_IDR_HAS_CASSOC BIT(28) +#define MPAMF_CCAP_IDR_HAS_CMIN BIT(29) +#define MPAMF_CCAP_IDR_NO_CMAX BIT(30) +#define MPAMF_CCAP_IDR_HAS_CMAX_SOFTLIM BIT(31) + +/* MPAMF_MBW_IDR - MPAM features memory bandwidth partitioning ID register */ +#define MPAMF_MBW_IDR_BWA_WD GENMASK(5, 0) +#define MPAMF_MBW_IDR_HAS_MIN BIT(10) +#define MPAMF_MBW_IDR_HAS_MAX BIT(11) +#define MPAMF_MBW_IDR_HAS_PBM BIT(12) +#define MPAMF_MBW_IDR_HAS_PROP BIT(13) +#define MPAMF_MBW_IDR_WINDWR BIT(14) +#define MPAMF_MBW_IDR_BWPBM_WD GENMASK(28, 16) + +/* MPAMF_PRI_IDR - MPAM features priority partitioning ID register */ +#define MPAMF_PRI_IDR_HAS_INTPRI BIT(0) +#define MPAMF_PRI_IDR_INTPRI_0_IS_LOW BIT(1) +#define MPAMF_PRI_IDR_INTPRI_WD GENMASK(9, 4) +#define MPAMF_PRI_IDR_HAS_DSPRI BIT(16) +#define MPAMF_PRI_IDR_DSPRI_0_IS_LOW BIT(17) +#define MPAMF_PRI_IDR_DSPRI_WD GENMASK(25, 20) + +/* MPAMF_CSUMON_IDR - MPAM cache storage usage monitor ID register */ +#define MPAMF_CSUMON_IDR_NUM_MON GENMASK(15, 0) +#define MPAMF_CSUMON_IDR_HAS_OFLOW_CAPT BIT(24) +#define MPAMF_CSUMON_IDR_HAS_CEVNT_OFLW BIT(25) +#define MPAMF_CSUMON_IDR_HAS_OFSR BIT(26) +#define MPAMF_CSUMON_IDR_HAS_OFLOW_LNKG BIT(27) +#define MPAMF_CSUMON_IDR_HAS_XCL BIT(29) +#define MPAMF_CSUMON_IDR_CSU_RO BIT(30) +#define MPAMF_CSUMON_IDR_HAS_CAPTURE BIT(31) + +/* MPAMF_MBWUMON_IDR - MPAM memory bandwidth usage monitor ID register */ +#define MPAMF_MBWUMON_IDR_NUM_MON GENMASK(15, 0) +#define MPAMF_MBWUMON_IDR_HAS_RWBW BIT(28) +#define MPAMF_MBWUMON_IDR_LWD BIT(29) +#define MPAMF_MBWUMON_IDR_HAS_LONG BIT(30) +#define MPAMF_MBWUMON_IDR_HAS_CAPTURE BIT(31) + +/* MPAMF_PARTID_NRW_IDR - MPAM PARTID narrowing ID register */ +#define MPAMF_PARTID_NRW_IDR_INTPARTID_MAX GENMASK(15, 0) + +/* MPAMF_IIDR - MPAM implementation ID register */ +#define MPAMF_IIDR_IMPLEMENTER GENMASK(11, 0) +#define MPAMF_IIDR_REVISION GENMASK(15, 12) +#define MPAMF_IIDR_VARIANT GENMASK(19, 16) +#define MPAMF_IIDR_PRODUCTID GENMASK(31, 20) + +/* MPAMF_AIDR - MPAM architecture ID register */ +#define MPAMF_AIDR_ARCH_MINOR_REV GENMASK(3, 0) +#define MPAMF_AIDR_ARCH_MAJOR_REV GENMASK(7, 4) + +/* MPAMCFG_PART_SEL - MPAM partition configuration selection register */ +#define MPAMCFG_PART_SEL_PARTID_SEL GENMASK(15, 0) +#define MPAMCFG_PART_SEL_INTERNAL BIT(16) +#define MPAMCFG_PART_SEL_RIS GENMASK(27, 24) + +/* MPAMCFG_CASSOC - MPAM cache maximum associativity partition configuration register */ +#define MPAMCFG_CASSOC_CASSOC GENMASK(15, 0) + +/* MPAMCFG_CMAX - MPAM cache capacity configuration register */ +#define MPAMCFG_CMAX_SOFTLIM BIT(31) +#define MPAMCFG_CMAX_CMAX GENMASK(15, 0) + +/* MPAMCFG_CMIN - MPAM cache capacity configuration register */ +#define MPAMCFG_CMIN_CMIN GENMASK(15, 0) + +/* + * MPAMCFG_MBW_MIN - MPAM memory minimum bandwidth partitioning configuration + * register + */ +#define MPAMCFG_MBW_MIN_MIN GENMASK(15, 0) + +/* + * MPAMCFG_MBW_MAX - MPAM memory maximum bandwidth partitioning configuration + * register + */ +#define MPAMCFG_MBW_MAX_MAX GENMASK(15, 0) +#define MPAMCFG_MBW_MAX_HARDLIM BIT(31) + +/* + * MPAMCFG_MBW_WINWD - MPAM memory bandwidth partitioning window width + * register + */ +#define MPAMCFG_MBW_WINWD_US_FRAC GENMASK(7, 0) +#define MPAMCFG_MBW_WINWD_US_INT GENMASK(23, 8) + +/* MPAMCFG_PRI - MPAM priority partitioning configuration register */ +#define MPAMCFG_PRI_INTPRI GENMASK(15, 0) +#define MPAMCFG_PRI_DSPRI GENMASK(31, 16) + +/* + * MPAMCFG_MBW_PROP - Memory bandwidth proportional stride partitioning + * configuration register + */ +#define MPAMCFG_MBW_PROP_STRIDEM1 GENMASK(15, 0) +#define MPAMCFG_MBW_PROP_EN BIT(31) + +/* + * MPAMCFG_INTPARTID - MPAM internal partition narrowing configuration register + */ +#define MPAMCFG_INTPARTID_INTPARTID GENMASK(15, 0) +#define MPAMCFG_INTPARTID_INTERNAL BIT(16) + +/* MSMON_CFG_MON_SEL - Memory system performance monitor selection register */ +#define MSMON_CFG_MON_SEL_MON_SEL GENMASK(15, 0) +#define MSMON_CFG_MON_SEL_RIS GENMASK(27, 24) + +/* MPAMF_ESR - MPAM Error Status Register */ +#define MPAMF_ESR_PARTID_MON GENMASK(15, 0) +#define MPAMF_ESR_PMG GENMASK(23, 16) +#define MPAMF_ESR_ERRCODE GENMASK(27, 24) +#define MPAMF_ESR_OVRWR BIT(31) +#define MPAMF_ESR_RIS GENMASK(35, 32) + +/* MPAMF_ECR - MPAM Error Control Register */ +#define MPAMF_ECR_INTEN BIT(0) + +/* Error conditions in accessing memory mapped registers */ +#define MPAM_ERRCODE_NONE 0 +#define MPAM_ERRCODE_PARTID_SEL_RANGE 1 +#define MPAM_ERRCODE_REQ_PARTID_RANGE 2 +#define MPAM_ERRCODE_MSMONCFG_ID_RANGE 3 +#define MPAM_ERRCODE_REQ_PMG_RANGE 4 +#define MPAM_ERRCODE_MONITOR_RANGE 5 +#define MPAM_ERRCODE_INTPARTID_RANGE 6 +#define MPAM_ERRCODE_UNEXPECTED_INTERNAL 7 +#define MPAM_ERRCODE_UNDEFINED_RIS_PART_SEL 8 +#define MPAM_ERRCODE_RIS_NO_CONTROL 9 +#define MPAM_ERRCODE_UNDEFINED_RIS_MON_SEL 10 +#define MPAM_ERRCODE_RIS_NO_MONITOR 11 + +/* + * MSMON_CFG_CSU_CTL - Memory system performance monitor configure cache storage + * usage monitor control register + * MSMON_CFG_MBWU_CTL - Memory system performance monitor configure memory + * bandwidth usage monitor control register + */ +#define MSMON_CFG_x_CTL_TYPE GENMASK(7, 0) +#define MSMON_CFG_MBWU_CTL_OFLOW_STATUS_L BIT(15) +#define MSMON_CFG_x_CTL_MATCH_PARTID BIT(16) +#define MSMON_CFG_x_CTL_MATCH_PMG BIT(17) +#define MSMON_CFG_MBWU_CTL_SCLEN BIT(19) +#define MSMON_CFG_x_CTL_SUBTYPE GENMASK(22, 20) +#define MSMON_CFG_x_CTL_OFLOW_FRZ BIT(24) +#define MSMON_CFG_x_CTL_OFLOW_INTR BIT(25) +#define MSMON_CFG_x_CTL_OFLOW_STATUS BIT(26) +#define MSMON_CFG_x_CTL_CAPT_RESET BIT(27) +#define MSMON_CFG_x_CTL_CAPT_EVNT GENMASK(30, 28) +#define MSMON_CFG_x_CTL_EN BIT(31) + +#define MSMON_CFG_MBWU_CTL_TYPE_MBWU 0x42 +#define MSMON_CFG_CSU_CTL_TYPE_CSU 0x43 + +/* + * MSMON_CFG_CSU_FLT - Memory system performance monitor configure cache storage + * usage monitor filter register + * MSMON_CFG_MBWU_FLT - Memory system performance monitor configure memory + * bandwidth usage monitor filter register + */ +#define MSMON_CFG_x_FLT_PARTID GENMASK(15, 0) +#define MSMON_CFG_x_FLT_PMG GENMASK(23, 16) + +#define MSMON_CFG_MBWU_FLT_RWBW GENMASK(31, 30) +#define MSMON_CFG_CSU_FLT_XCL BIT(31) + +/* + * MSMON_CSU - Memory system performance monitor cache storage usage monitor + * register + * MSMON_CSU_CAPTURE - Memory system performance monitor cache storage usage + * capture register + * MSMON_MBWU - Memory system performance monitor memory bandwidth usage + * monitor register + * MSMON_MBWU_CAPTURE - Memory system performance monitor memory bandwidth usage + * capture register + */ +#define MSMON___VALUE GENMASK(30, 0) +#define MSMON___NRDY BIT(31) +#define MSMON___L_NRDY BIT(63) +#define MSMON___L_VALUE GENMASK(43, 0) +#define MSMON___LWD_VALUE GENMASK(62, 0) + +/* + * MSMON_CAPT_EVNT - Memory system performance monitoring capture event + * generation register + */ +#define MSMON_CAPT_EVNT_NOW BIT(0) + +#endif /* MPAM_INTERNAL_H */ diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c new file mode 100644 index 0000000000000..98942d95a3c95 --- /dev/null +++ b/drivers/resctrl/mpam_resctrl.c @@ -0,0 +1,1704 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (C) 2025 Arm Ltd. + +#define pr_fmt(fmt) "%s:%s: " fmt, KBUILD_MODNAME, __func__ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "mpam_internal.h" + +DECLARE_WAIT_QUEUE_HEAD(resctrl_mon_ctx_waiters); + +/* + * The classes we've picked to map to resctrl resources, wrapped + * in with their resctrl structure. + * Class pointer may be NULL. + */ +static struct mpam_resctrl_res mpam_resctrl_controls[RDT_NUM_RESOURCES]; + +#define for_each_mpam_resctrl_control(res, rid) \ + for (rid = 0, res = &mpam_resctrl_controls[rid]; \ + rid < RDT_NUM_RESOURCES; \ + rid++, res = &mpam_resctrl_controls[rid]) + +/* + * The classes we've picked to map to resctrl events. + * Resctrl believes all the worlds a Xeon, and these are all on the L3. This + * array lets us find the actual class backing the event counters. e.g. + * the only memory bandwidth counters may be on the memory controller, but to + * make use of them, we pretend they are on L3. Restrict the events considered + * to those supported by MPAM. + * Class pointer may be NULL. + */ +#define MPAM_MAX_EVENT QOS_L3_MBM_TOTAL_EVENT_ID +static struct mpam_resctrl_mon mpam_resctrl_counters[MPAM_MAX_EVENT + 1]; + +#define for_each_mpam_resctrl_mon(mon, eventid) \ + for (eventid = QOS_FIRST_EVENT, mon = &mpam_resctrl_counters[eventid]; \ + eventid <= MPAM_MAX_EVENT; \ + eventid++, mon = &mpam_resctrl_counters[eventid]) + +/* The lock for modifying resctrl's domain lists from cpuhp callbacks. */ +static DEFINE_MUTEX(domain_list_lock); + +/* + * MPAM emulates CDP by setting different PARTID in the I/D fields of MPAM0_EL1. + * This applies globally to all traffic the CPU generates. + */ +static bool cdp_enabled; + +/* + * We use cacheinfo to discover the size of the caches and their id. cacheinfo + * populates this from a device_initcall(). mpam_resctrl_setup() must wait. + */ +static bool cacheinfo_ready; +static DECLARE_WAIT_QUEUE_HEAD(wait_cacheinfo_ready); + +/* + * If resctrl_init() succeeded, resctrl_exit() can be used to remove support + * for the filesystem in the event of an error. + */ +static bool resctrl_enabled; + +bool resctrl_arch_alloc_capable(void) +{ + struct mpam_resctrl_res *res; + enum resctrl_res_level rid; + + for_each_mpam_resctrl_control(res, rid) { + if (res->resctrl_res.alloc_capable) + return true; + } + + return false; +} + +bool resctrl_arch_mon_capable(void) +{ + struct mpam_resctrl_res *res = &mpam_resctrl_controls[RDT_RESOURCE_L3]; + struct rdt_resource *l3 = &res->resctrl_res; + + /* All monitors are presented as being on the L3 cache */ + return l3->mon_capable; +} + +bool resctrl_arch_is_evt_configurable(enum resctrl_event_id evt) +{ + return false; +} + +void resctrl_arch_mon_event_config_read(void *info) +{ +} + +void resctrl_arch_mon_event_config_write(void *info) +{ +} + +void resctrl_arch_reset_rmid_all(struct rdt_resource *r, struct rdt_l3_mon_domain *d) +{ +} + +void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_l3_mon_domain *d, + u32 closid, u32 rmid, enum resctrl_event_id eventid) +{ +} + +void resctrl_arch_reset_cntr(struct rdt_resource *r, struct rdt_l3_mon_domain *d, + u32 closid, u32 rmid, int cntr_id, + enum resctrl_event_id eventid) +{ +} + +void resctrl_arch_config_cntr(struct rdt_resource *r, struct rdt_l3_mon_domain *d, + enum resctrl_event_id evtid, u32 rmid, u32 closid, + u32 cntr_id, bool assign) +{ +} + +int resctrl_arch_cntr_read(struct rdt_resource *r, struct rdt_l3_mon_domain *d, + u32 unused, u32 rmid, int cntr_id, + enum resctrl_event_id eventid, u64 *val) +{ + return -EOPNOTSUPP; +} + +bool resctrl_arch_mbm_cntr_assign_enabled(struct rdt_resource *r) +{ + return false; +} + +int resctrl_arch_mbm_cntr_assign_set(struct rdt_resource *r, bool enable) +{ + return -EINVAL; +} + +int resctrl_arch_io_alloc_enable(struct rdt_resource *r, bool enable) +{ + return -EOPNOTSUPP; +} + +bool resctrl_arch_get_io_alloc_enabled(struct rdt_resource *r) +{ + return false; +} + +void resctrl_arch_pre_mount(void) +{ +} + +bool resctrl_arch_get_cdp_enabled(enum resctrl_res_level rid) +{ + return mpam_resctrl_controls[rid].cdp_enabled; +} + +/** + * resctrl_reset_task_closids() - Reset the PARTID/PMG values for all tasks. + * + * At boot, all existing tasks use partid zero for D and I. + * To enable/disable CDP emulation, all these tasks need relabelling. + */ +static void resctrl_reset_task_closids(void) +{ + struct task_struct *p, *t; + + read_lock(&tasklist_lock); + for_each_process_thread(p, t) { + resctrl_arch_set_closid_rmid(t, RESCTRL_RESERVED_CLOSID, + RESCTRL_RESERVED_RMID); + } + read_unlock(&tasklist_lock); +} + +int resctrl_arch_set_cdp_enabled(enum resctrl_res_level rid, bool enable) +{ + u32 partid_i = RESCTRL_RESERVED_CLOSID, partid_d = RESCTRL_RESERVED_CLOSID; + struct mpam_resctrl_res *res = &mpam_resctrl_controls[RDT_RESOURCE_L3]; + struct rdt_resource *l3 = &res->resctrl_res; + int cpu; + + if (!IS_ENABLED(CONFIG_EXPERT) && enable) { + /* + * If the resctrl fs is mounted more than once, sequentially, + * then CDP can lead to the use of out of range PARTIDs. + */ + pr_warn("CDP not supported\n"); + return -EOPNOTSUPP; + } + + if (enable) + pr_warn("CDP is an expert feature and may cause MPAM to malfunction.\n"); + + /* + * resctrl_arch_set_cdp_enabled() is only called with enable set to + * false on error and unmount. + */ + cdp_enabled = enable; + mpam_resctrl_controls[rid].cdp_enabled = enable; + + if (enable) + l3->mon.num_rmid = resctrl_arch_system_num_rmid_idx() / 2; + else + l3->mon.num_rmid = resctrl_arch_system_num_rmid_idx(); + + /* The mbw_max feature can't hide cdp as it's a per-partid maximum. */ + if (cdp_enabled && !mpam_resctrl_controls[RDT_RESOURCE_MBA].cdp_enabled) + mpam_resctrl_controls[RDT_RESOURCE_MBA].resctrl_res.alloc_capable = false; + + if (mpam_resctrl_controls[RDT_RESOURCE_MBA].cdp_enabled && + mpam_resctrl_controls[RDT_RESOURCE_MBA].class) + mpam_resctrl_controls[RDT_RESOURCE_MBA].resctrl_res.alloc_capable = true; + + if (enable) { + if (mpam_partid_max < 1) + return -EINVAL; + + partid_d = resctrl_get_config_index(RESCTRL_RESERVED_CLOSID, CDP_DATA); + partid_i = resctrl_get_config_index(RESCTRL_RESERVED_CLOSID, CDP_CODE); + } + + mpam_set_task_partid_pmg(current, partid_d, partid_i, 0, 0); + WRITE_ONCE(arm64_mpam_global_default, mpam_get_regval(current)); + + resctrl_reset_task_closids(); + + for_each_possible_cpu(cpu) + mpam_set_cpu_defaults(cpu, partid_d, partid_i, 0, 0); + on_each_cpu(resctrl_arch_sync_cpu_closid_rmid, NULL, 1); + + return 0; +} + +static bool mpam_resctrl_hide_cdp(enum resctrl_res_level rid) +{ + return cdp_enabled && !resctrl_arch_get_cdp_enabled(rid); +} + +/* + * MSC may raise an error interrupt if it sees an out or range partid/pmg, + * and go on to truncate the value. Regardless of what the hardware supports, + * only the system wide safe value is safe to use. + */ +u32 resctrl_arch_get_num_closid(struct rdt_resource *ignored) +{ + return mpam_partid_max + 1; +} + +u32 resctrl_arch_system_num_rmid_idx(void) +{ + return (mpam_pmg_max + 1) * (mpam_partid_max + 1); +} + +u32 resctrl_arch_rmid_idx_encode(u32 closid, u32 rmid) +{ + return closid * (mpam_pmg_max + 1) + rmid; +} + +void resctrl_arch_rmid_idx_decode(u32 idx, u32 *closid, u32 *rmid) +{ + *closid = idx / (mpam_pmg_max + 1); + *rmid = idx % (mpam_pmg_max + 1); +} + +void resctrl_arch_sched_in(struct task_struct *tsk) +{ + lockdep_assert_preemption_disabled(); + + mpam_thread_switch(tsk); +} + +void resctrl_arch_set_cpu_default_closid_rmid(int cpu, u32 closid, u32 rmid) +{ + WARN_ON_ONCE(closid > U16_MAX); + WARN_ON_ONCE(rmid > U8_MAX); + + if (!cdp_enabled) { + mpam_set_cpu_defaults(cpu, closid, closid, rmid, rmid); + } else { + /* + * When CDP is enabled, resctrl halves the closid range and we + * use odd/even partid for one closid. + */ + u32 partid_d = resctrl_get_config_index(closid, CDP_DATA); + u32 partid_i = resctrl_get_config_index(closid, CDP_CODE); + + mpam_set_cpu_defaults(cpu, partid_d, partid_i, rmid, rmid); + } +} + +void resctrl_arch_sync_cpu_closid_rmid(void *info) +{ + struct resctrl_cpu_defaults *r = info; + + lockdep_assert_preemption_disabled(); + + if (r) { + resctrl_arch_set_cpu_default_closid_rmid(smp_processor_id(), + r->closid, r->rmid); + } + + resctrl_arch_sched_in(current); +} + +void resctrl_arch_set_closid_rmid(struct task_struct *tsk, u32 closid, u32 rmid) +{ + WARN_ON_ONCE(closid > U16_MAX); + WARN_ON_ONCE(rmid > U8_MAX); + + if (!cdp_enabled) { + mpam_set_task_partid_pmg(tsk, closid, closid, rmid, rmid); + } else { + u32 partid_d = resctrl_get_config_index(closid, CDP_DATA); + u32 partid_i = resctrl_get_config_index(closid, CDP_CODE); + + mpam_set_task_partid_pmg(tsk, partid_d, partid_i, rmid, rmid); + } +} + +bool resctrl_arch_match_closid(struct task_struct *tsk, u32 closid) +{ + u64 regval = mpam_get_regval(tsk); + u32 tsk_closid = FIELD_GET(MPAM0_EL1_PARTID_D, regval); + + if (cdp_enabled) + tsk_closid >>= 1; + + return tsk_closid == closid; +} + +/* The task's pmg is not unique, the partid must be considered too */ +bool resctrl_arch_match_rmid(struct task_struct *tsk, u32 closid, u32 rmid) +{ + u64 regval = mpam_get_regval(tsk); + u32 tsk_closid = FIELD_GET(MPAM0_EL1_PARTID_D, regval); + u32 tsk_rmid = FIELD_GET(MPAM0_EL1_PMG_D, regval); + + if (cdp_enabled) + tsk_closid >>= 1; + + return (tsk_closid == closid) && (tsk_rmid == rmid); +} + +struct rdt_resource *resctrl_arch_get_resource(enum resctrl_res_level l) +{ + if (l >= RDT_NUM_RESOURCES) + return NULL; + + return &mpam_resctrl_controls[l].resctrl_res; +} + +static int resctrl_arch_mon_ctx_alloc_no_wait(enum resctrl_event_id evtid) +{ + struct mpam_resctrl_mon *mon = &mpam_resctrl_counters[evtid]; + + if (!mpam_is_enabled()) + return -EINVAL; + + if (!mon->class) + return -EINVAL; + + switch (evtid) { + case QOS_L3_OCCUP_EVENT_ID: + /* With CDP, one monitor gets used for both code/data reads */ + return mpam_alloc_csu_mon(mon->class); + case QOS_L3_MBM_LOCAL_EVENT_ID: + case QOS_L3_MBM_TOTAL_EVENT_ID: + return USE_PRE_ALLOCATED; + default: + return -EOPNOTSUPP; + } +} + +void *resctrl_arch_mon_ctx_alloc(struct rdt_resource *r, + enum resctrl_event_id evtid) +{ + DEFINE_WAIT(wait); + int *ret; + + ret = kmalloc_obj(*ret, GFP_KERNEL); + if (!ret) + return ERR_PTR(-ENOMEM); + + do { + prepare_to_wait(&resctrl_mon_ctx_waiters, &wait, + TASK_INTERRUPTIBLE); + *ret = resctrl_arch_mon_ctx_alloc_no_wait(evtid); + if (*ret == -ENOSPC) + schedule(); + } while (*ret == -ENOSPC && !signal_pending(current)); + finish_wait(&resctrl_mon_ctx_waiters, &wait); + + return ret; +} + +static void resctrl_arch_mon_ctx_free_no_wait(enum resctrl_event_id evtid, + u32 mon_idx) +{ + struct mpam_resctrl_mon *mon = &mpam_resctrl_counters[evtid]; + + if (!mpam_is_enabled()) + return; + + if (!mon->class) + return; + + if (evtid == QOS_L3_OCCUP_EVENT_ID) + mpam_free_csu_mon(mon->class, mon_idx); + + wake_up(&resctrl_mon_ctx_waiters); +} + +void resctrl_arch_mon_ctx_free(struct rdt_resource *r, + enum resctrl_event_id evtid, void *arch_mon_ctx) +{ + u32 mon_idx = *(u32 *)arch_mon_ctx; + + kfree(arch_mon_ctx); + + resctrl_arch_mon_ctx_free_no_wait(evtid, mon_idx); +} + +static int __read_mon(struct mpam_resctrl_mon *mon, struct mpam_component *mon_comp, + enum mpam_device_features mon_type, + int mon_idx, + enum resctrl_conf_type cdp_type, u32 closid, u32 rmid, u64 *val) +{ + struct mon_cfg cfg; + + if (!mpam_is_enabled()) + return -EINVAL; + + /* Shift closid to account for CDP */ + closid = resctrl_get_config_index(closid, cdp_type); + + if (irqs_disabled()) { + /* Check if we can access this domain without an IPI */ + return -EIO; + } + + cfg = (struct mon_cfg) { + .mon = mon_idx, + .match_pmg = true, + .partid = closid, + .pmg = rmid, + }; + + return mpam_msmon_read(mon_comp, &cfg, mon_type, val); +} + +static int read_mon_cdp_safe(struct mpam_resctrl_mon *mon, struct mpam_component *mon_comp, + enum mpam_device_features mon_type, + int mon_idx, u32 closid, u32 rmid, u64 *val) +{ + if (cdp_enabled) { + u64 code_val = 0, data_val = 0; + int err; + + err = __read_mon(mon, mon_comp, mon_type, mon_idx, + CDP_CODE, closid, rmid, &code_val); + if (err) + return err; + + err = __read_mon(mon, mon_comp, mon_type, mon_idx, + CDP_DATA, closid, rmid, &data_val); + if (err) + return err; + + *val += code_val + data_val; + return 0; + } + + return __read_mon(mon, mon_comp, mon_type, mon_idx, + CDP_NONE, closid, rmid, val); +} + +/* MBWU when not in ABMC mode (not supported), and CSU counters. */ +int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain_hdr *hdr, + u32 closid, u32 rmid, enum resctrl_event_id eventid, + void *arch_priv, u64 *val, void *arch_mon_ctx) +{ + struct mpam_resctrl_dom *l3_dom; + struct mpam_component *mon_comp; + u32 mon_idx = *(u32 *)arch_mon_ctx; + enum mpam_device_features mon_type; + struct mpam_resctrl_mon *mon = &mpam_resctrl_counters[eventid]; + + resctrl_arch_rmid_read_context_check(); + + if (!mpam_is_enabled()) + return -EINVAL; + + if (eventid >= QOS_NUM_EVENTS || !mon->class) + return -EINVAL; + + l3_dom = container_of(hdr, struct mpam_resctrl_dom, resctrl_mon_dom.hdr); + mon_comp = l3_dom->mon_comp[eventid]; + + if (eventid != QOS_L3_OCCUP_EVENT_ID) + return -EINVAL; + + mon_type = mpam_feat_msmon_csu; + + return read_mon_cdp_safe(mon, mon_comp, mon_type, mon_idx, + closid, rmid, val); +} + +/* + * The rmid realloc threshold should be for the smallest cache exposed to + * resctrl. + */ +static int update_rmid_limits(struct mpam_class *class) +{ + u32 num_unique_pmg = resctrl_arch_system_num_rmid_idx(); + struct mpam_props *cprops = &class->props; + struct cacheinfo *ci; + + lockdep_assert_cpus_held(); + + if (!mpam_has_feature(mpam_feat_msmon_csu, cprops)) + return 0; + + /* + * Assume cache levels are the same size for all CPUs... + * The check just requires any online CPU and it can't go offline as we + * hold the cpu lock. + */ + ci = get_cpu_cacheinfo_level(raw_smp_processor_id(), class->level); + if (!ci || ci->size == 0) { + pr_debug("Could not read cache size for class %u\n", + class->level); + return -EINVAL; + } + + if (!resctrl_rmid_realloc_limit || + ci->size < resctrl_rmid_realloc_limit) { + resctrl_rmid_realloc_limit = ci->size; + resctrl_rmid_realloc_threshold = ci->size / num_unique_pmg; + } + + return 0; +} + +static bool cache_has_usable_cpor(struct mpam_class *class) +{ + struct mpam_props *cprops = &class->props; + + if (!mpam_has_feature(mpam_feat_cpor_part, cprops)) + return false; + + /* resctrl uses u32 for all bitmap configurations */ + return class->props.cpbm_wd <= 32; +} + +static bool mba_class_use_mbw_max(struct mpam_props *cprops) +{ + return (mpam_has_feature(mpam_feat_mbw_max, cprops) && + cprops->bwa_wd); +} + +static bool class_has_usable_mba(struct mpam_props *cprops) +{ + return mba_class_use_mbw_max(cprops); +} + +static bool cache_has_usable_csu(struct mpam_class *class) +{ + struct mpam_props *cprops; + + if (!class) + return false; + + cprops = &class->props; + + if (!mpam_has_feature(mpam_feat_msmon_csu, cprops)) + return false; + + /* + * CSU counters settle on the value, so we can get away with + * having only one. + */ + if (!cprops->num_csu_mon) + return false; + + return true; +} + +/* + * Calculate the worst-case percentage change from each implemented step + * in the control. + */ +static u32 get_mba_granularity(struct mpam_props *cprops) +{ + if (!mba_class_use_mbw_max(cprops)) + return 0; + + /* + * bwa_wd is the number of bits implemented in the 0.xxx + * fixed point fraction. 1 bit is 50%, 2 is 25% etc. + */ + return DIV_ROUND_UP(MAX_MBA_BW, 1 << cprops->bwa_wd); +} + +/* + * Each fixed-point hardware value architecturally represents a range + * of values: the full range 0% - 100% is split contiguously into + * (1 << cprops->bwa_wd) equal bands. + * + * Although the bwa_bwd fields have 6 bits the maximum valid value is 16 + * as it reports the width of fields that are at most 16 bits. When + * fewer than 16 bits are valid the least significant bits are + * ignored. The implied binary point is kept between bits 15 and 16 and + * so the valid bits are leftmost. + * + * See ARM IHI0099B.a "MPAM system component specification", Section 9.3, + * "The fixed-point fractional format" for more information. + * + * Find the nearest percentage value to the upper bound of the selected band: + */ +static u32 mbw_max_to_percent(u16 mbw_max, struct mpam_props *cprops) +{ + u32 val = mbw_max; + + val >>= 16 - cprops->bwa_wd; + val += 1; + val *= MAX_MBA_BW; + val = DIV_ROUND_CLOSEST(val, 1 << cprops->bwa_wd); + + return val; +} + +/* + * Find the band whose upper bound is closest to the specified percentage. + * + * A round-to-nearest policy is followed here as a balanced compromise + * between unexpected under-commit of the resource (where the total of + * a set of resource allocations after conversion is less than the + * expected total, due to rounding of the individual converted + * percentages) and over-commit (where the total of the converted + * allocations is greater than expected). + */ +static u16 percent_to_mbw_max(u8 pc, struct mpam_props *cprops) +{ + u32 val = pc; + + val <<= cprops->bwa_wd; + val = DIV_ROUND_CLOSEST(val, MAX_MBA_BW); + val = max(val, 1) - 1; + val <<= 16 - cprops->bwa_wd; + + return val; +} + +static u32 get_mba_min(struct mpam_props *cprops) +{ + if (!mba_class_use_mbw_max(cprops)) { + WARN_ON_ONCE(1); + return 0; + } + + return mbw_max_to_percent(0, cprops); +} + +/* Find the L3 cache that has affinity with this CPU */ +static int find_l3_equivalent_bitmask(int cpu, cpumask_var_t tmp_cpumask) +{ + u32 cache_id = get_cpu_cacheinfo_id(cpu, 3); + + lockdep_assert_cpus_held(); + + return mpam_get_cpumask_from_cache_id(cache_id, 3, tmp_cpumask); +} + +/* + * topology_matches_l3() - Is the provided class the same shape as L3 + * @victim: The class we'd like to pretend is L3. + * + * resctrl expects all the world's a Xeon, and all counters are on the + * L3. We allow some mapping counters on other classes. This requires + * that the CPU->domain mapping is the same kind of shape. + * + * Using cacheinfo directly would make this work even if resctrl can't + * use the L3 - but cacheinfo can't tell us anything about offline CPUs. + * Using the L3 resctrl domain list also depends on CPUs being online. + * Using the mpam_class we picked for L3 so we can use its domain list + * assumes that there are MPAM controls on the L3. + * Instead, this path eventually uses the mpam_get_cpumask_from_cache_id() + * helper which can tell us about offline CPUs ... but getting the cache_id + * to start with relies on at least one CPU per L3 cache being online at + * boot. + * + * Walk the victim component list and compare the affinity mask with the + * corresponding L3. The topology matches if each victim:component's affinity + * mask is the same as the CPU's corresponding L3's. These lists/masks are + * computed from firmware tables so don't change at runtime. + */ +static bool topology_matches_l3(struct mpam_class *victim) +{ + int cpu, err; + struct mpam_component *victim_iter; + + lockdep_assert_cpus_held(); + + cpumask_var_t __free(free_cpumask_var) tmp_cpumask = CPUMASK_VAR_NULL; + if (!alloc_cpumask_var(&tmp_cpumask, GFP_KERNEL)) + return false; + + guard(srcu)(&mpam_srcu); + list_for_each_entry_srcu(victim_iter, &victim->components, class_list, + srcu_read_lock_held(&mpam_srcu)) { + if (cpumask_empty(&victim_iter->affinity)) { + pr_debug("class %u has CPU-less component %u - can't match L3!\n", + victim->level, victim_iter->comp_id); + return false; + } + + cpu = cpumask_any_and(&victim_iter->affinity, cpu_online_mask); + if (WARN_ON_ONCE(cpu >= nr_cpu_ids)) + return false; + + cpumask_clear(tmp_cpumask); + err = find_l3_equivalent_bitmask(cpu, tmp_cpumask); + if (err) { + pr_debug("Failed to find L3's equivalent component to class %u component %u\n", + victim->level, victim_iter->comp_id); + return false; + } + + /* Any differing bits in the affinity mask? */ + if (!cpumask_equal(tmp_cpumask, &victim_iter->affinity)) { + pr_debug("class %u component %u has Mismatched CPU mask with L3 equivalent\n" + "L3:%*pbl != victim:%*pbl\n", + victim->level, victim_iter->comp_id, + cpumask_pr_args(tmp_cpumask), + cpumask_pr_args(&victim_iter->affinity)); + + return false; + } + } + + return true; +} + +/* + * Test if the traffic for a class matches that at egress from the L3. For + * MSC at memory controllers this is only possible if there is a single L3 + * as otherwise the counters at the memory can include bandwidth from the + * non-local L3. + */ +static bool traffic_matches_l3(struct mpam_class *class) +{ + int err, cpu; + + lockdep_assert_cpus_held(); + + if (class->type == MPAM_CLASS_CACHE && class->level == 3) + return true; + + if (class->type == MPAM_CLASS_CACHE && class->level != 3) { + pr_debug("class %u is a different cache from L3\n", class->level); + return false; + } + + if (class->type != MPAM_CLASS_MEMORY) { + pr_debug("class %u is neither of type cache or memory\n", class->level); + return false; + } + + cpumask_var_t __free(free_cpumask_var) tmp_cpumask = CPUMASK_VAR_NULL; + if (!alloc_cpumask_var(&tmp_cpumask, GFP_KERNEL)) { + pr_debug("cpumask allocation failed\n"); + return false; + } + + cpu = cpumask_any_and(&class->affinity, cpu_online_mask); + err = find_l3_equivalent_bitmask(cpu, tmp_cpumask); + if (err) { + pr_debug("Failed to find L3 downstream to cpu %d\n", cpu); + return false; + } + + if (!cpumask_equal(tmp_cpumask, cpu_possible_mask)) { + pr_debug("There is more than one L3\n"); + return false; + } + + /* Be strict; the traffic might stop in the intermediate cache. */ + if (get_cpu_cacheinfo_id(cpu, 4) != -1) { + pr_debug("L3 isn't the last level of cache\n"); + return false; + } + + if (num_possible_nodes() > 1) { + pr_debug("There is more than one numa node\n"); + return false; + } + +#ifdef CONFIG_HMEM_REPORTING + if (node_devices[cpu_to_node(cpu)]->cache_dev) { + pr_debug("There is a memory side cache\n"); + return false; + } +#endif + + return true; +} + +/* Test whether we can export MPAM_CLASS_CACHE:{2,3}? */ +static void mpam_resctrl_pick_caches(void) +{ + struct mpam_class *class; + struct mpam_resctrl_res *res; + + lockdep_assert_cpus_held(); + + guard(srcu)(&mpam_srcu); + list_for_each_entry_srcu(class, &mpam_classes, classes_list, + srcu_read_lock_held(&mpam_srcu)) { + if (class->type != MPAM_CLASS_CACHE) { + pr_debug("class %u is not a cache\n", class->level); + continue; + } + + if (class->level != 2 && class->level != 3) { + pr_debug("class %u is not L2 or L3\n", class->level); + continue; + } + + if (!cache_has_usable_cpor(class)) { + pr_debug("class %u cache misses CPOR\n", class->level); + continue; + } + + if (!cpumask_equal(&class->affinity, cpu_possible_mask)) { + pr_debug("class %u has missing CPUs, mask %*pb != %*pb\n", class->level, + cpumask_pr_args(&class->affinity), + cpumask_pr_args(cpu_possible_mask)); + continue; + } + + if (class->level == 2) + res = &mpam_resctrl_controls[RDT_RESOURCE_L2]; + else + res = &mpam_resctrl_controls[RDT_RESOURCE_L3]; + res->class = class; + } +} + +static void mpam_resctrl_pick_mba(void) +{ + struct mpam_class *class, *candidate_class = NULL; + struct mpam_resctrl_res *res; + + lockdep_assert_cpus_held(); + + guard(srcu)(&mpam_srcu); + list_for_each_entry_srcu(class, &mpam_classes, classes_list, + srcu_read_lock_held(&mpam_srcu)) { + struct mpam_props *cprops = &class->props; + + if (class->level != 3 && class->type == MPAM_CLASS_CACHE) { + pr_debug("class %u is a cache but not the L3\n", class->level); + continue; + } + + if (!class_has_usable_mba(cprops)) { + pr_debug("class %u has no bandwidth control\n", + class->level); + continue; + } + + if (!cpumask_equal(&class->affinity, cpu_possible_mask)) { + pr_debug("class %u has missing CPUs\n", class->level); + continue; + } + + if (!topology_matches_l3(class)) { + pr_debug("class %u topology doesn't match L3\n", + class->level); + continue; + } + + if (!traffic_matches_l3(class)) { + pr_debug("class %u traffic doesn't match L3 egress\n", + class->level); + continue; + } + + /* + * Pick a resource to be MBA that as close as possible to + * the L3. mbm_total counts the bandwidth leaving the L3 + * cache and MBA should correspond as closely as possible + * for proper operation of mba_sc. + */ + if (!candidate_class || class->level < candidate_class->level) + candidate_class = class; + } + + if (candidate_class) { + pr_debug("selected class %u to back MBA\n", + candidate_class->level); + res = &mpam_resctrl_controls[RDT_RESOURCE_MBA]; + res->class = candidate_class; + } +} + +static void counter_update_class(enum resctrl_event_id evt_id, + struct mpam_class *class) +{ + struct mpam_class *existing_class = mpam_resctrl_counters[evt_id].class; + + if (existing_class) { + if (class->level == 3) { + pr_debug("Existing class is L3 - L3 wins\n"); + return; + } + + if (existing_class->level < class->level) { + pr_debug("Existing class is closer to L3, %u versus %u - closer is better\n", + existing_class->level, class->level); + return; + } + } + + mpam_resctrl_counters[evt_id].class = class; +} + +static void mpam_resctrl_pick_counters(void) +{ + struct mpam_class *class; + + lockdep_assert_cpus_held(); + + guard(srcu)(&mpam_srcu); + list_for_each_entry_srcu(class, &mpam_classes, classes_list, + srcu_read_lock_held(&mpam_srcu)) { + /* The name of the resource is L3... */ + if (class->type == MPAM_CLASS_CACHE && class->level != 3) { + pr_debug("class %u is a cache but not the L3", class->level); + continue; + } + + if (!cpumask_equal(&class->affinity, cpu_possible_mask)) { + pr_debug("class %u does not cover all CPUs", + class->level); + continue; + } + + if (cache_has_usable_csu(class)) { + pr_debug("class %u has usable CSU", + class->level); + + /* CSU counters only make sense on a cache. */ + switch (class->type) { + case MPAM_CLASS_CACHE: + if (update_rmid_limits(class)) + break; + + counter_update_class(QOS_L3_OCCUP_EVENT_ID, class); + break; + default: + break; + } + } + } +} + +static int mpam_resctrl_control_init(struct mpam_resctrl_res *res) +{ + struct mpam_class *class = res->class; + struct mpam_props *cprops = &class->props; + struct rdt_resource *r = &res->resctrl_res; + + switch (r->rid) { + case RDT_RESOURCE_L2: + case RDT_RESOURCE_L3: + r->schema_fmt = RESCTRL_SCHEMA_BITMAP; + r->cache.arch_has_sparse_bitmasks = true; + + r->cache.cbm_len = class->props.cpbm_wd; + /* mpam_devices will reject empty bitmaps */ + r->cache.min_cbm_bits = 1; + + if (r->rid == RDT_RESOURCE_L2) { + r->name = "L2"; + r->ctrl_scope = RESCTRL_L2_CACHE; + r->cdp_capable = true; + } else { + r->name = "L3"; + r->ctrl_scope = RESCTRL_L3_CACHE; + r->cdp_capable = true; + } + + /* + * Which bits are shared with other ...things... Unknown + * devices use partid-0 which uses all the bitmap fields. Until + * we have configured the SMMU and GIC not to do this 'all the + * bits' is the correct answer here. + */ + r->cache.shareable_bits = resctrl_get_default_ctrl(r); + r->alloc_capable = true; + break; + case RDT_RESOURCE_MBA: + r->schema_fmt = RESCTRL_SCHEMA_RANGE; + r->ctrl_scope = RESCTRL_L3_CACHE; + + r->membw.delay_linear = true; + r->membw.throttle_mode = THREAD_THROTTLE_UNDEFINED; + r->membw.min_bw = get_mba_min(cprops); + r->membw.max_bw = MAX_MBA_BW; + r->membw.bw_gran = get_mba_granularity(cprops); + + r->name = "MB"; + r->alloc_capable = true; + break; + default: + return -EINVAL; + } + + return 0; +} + +static int mpam_resctrl_pick_domain_id(int cpu, struct mpam_component *comp) +{ + struct mpam_class *class = comp->class; + + if (class->type == MPAM_CLASS_CACHE) + return comp->comp_id; + + if (topology_matches_l3(class)) { + /* Use the corresponding L3 component ID as the domain ID */ + int id = get_cpu_cacheinfo_id(cpu, 3); + + /* Implies topology_matches_l3() made a mistake */ + if (WARN_ON_ONCE(id == -1)) + return comp->comp_id; + + return id; + } + + /* Otherwise, expose the ID used by the firmware table code. */ + return comp->comp_id; +} + +static int mpam_resctrl_monitor_init(struct mpam_resctrl_mon *mon, + enum resctrl_event_id type) +{ + struct mpam_resctrl_res *res = &mpam_resctrl_controls[RDT_RESOURCE_L3]; + struct rdt_resource *l3 = &res->resctrl_res; + + lockdep_assert_cpus_held(); + + /* + * There also needs to be an L3 cache present. + * The check just requires any online CPU and it can't go offline as we + * hold the cpu lock. + */ + if (get_cpu_cacheinfo_id(raw_smp_processor_id(), 3) == -1) + return 0; + + /* + * If there are no MPAM resources on L3, force it into existence. + * topology_matches_l3() already ensures this looks like the L3. + * The domain-ids will be fixed up by mpam_resctrl_domain_hdr_init(). + */ + if (!res->class) { + pr_warn_once("Faking L3 MSC to enable counters.\n"); + res->class = mpam_resctrl_counters[type].class; + } + + /* + * Called multiple times!, once per event type that has a + * monitoring class. + * Setting name is necessary on monitor only platforms. + */ + l3->name = "L3"; + l3->mon_scope = RESCTRL_L3_CACHE; + + /* + * num-rmid is the upper bound for the number of monitoring groups that + * can exist simultaneously, including the default monitoring group for + * each control group. Hence, advertise the whole rmid_idx space even + * though each control group has its own pmg/rmid space. Unfortunately, + * this does mean userspace needs to know the architecture to correctly + * interpret this value. + */ + l3->mon.num_rmid = resctrl_arch_system_num_rmid_idx(); + + if (resctrl_enable_mon_event(type, false, 0, NULL)) + l3->mon_capable = true; + + return 0; +} + +u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_ctrl_domain *d, + u32 closid, enum resctrl_conf_type type) +{ + u32 partid; + struct mpam_config *cfg; + struct mpam_props *cprops; + struct mpam_resctrl_res *res; + struct mpam_resctrl_dom *dom; + enum mpam_device_features configured_by; + + lockdep_assert_cpus_held(); + + if (!mpam_is_enabled()) + return resctrl_get_default_ctrl(r); + + res = container_of(r, struct mpam_resctrl_res, resctrl_res); + dom = container_of(d, struct mpam_resctrl_dom, resctrl_ctrl_dom); + cprops = &res->class->props; + + /* + * When CDP is enabled, but the resource doesn't support it, + * the control is cloned across both partids. + * Pick one at random to read: + */ + if (mpam_resctrl_hide_cdp(r->rid)) + type = CDP_DATA; + + partid = resctrl_get_config_index(closid, type); + cfg = &dom->ctrl_comp->cfg[partid]; + + switch (r->rid) { + case RDT_RESOURCE_L2: + case RDT_RESOURCE_L3: + configured_by = mpam_feat_cpor_part; + break; + case RDT_RESOURCE_MBA: + if (mpam_has_feature(mpam_feat_mbw_max, cprops)) { + configured_by = mpam_feat_mbw_max; + break; + } + fallthrough; + default: + return resctrl_get_default_ctrl(r); + } + + if (!r->alloc_capable || partid >= resctrl_arch_get_num_closid(r) || + !mpam_has_feature(configured_by, cfg)) + return resctrl_get_default_ctrl(r); + + switch (configured_by) { + case mpam_feat_cpor_part: + return cfg->cpbm; + case mpam_feat_mbw_max: + return mbw_max_to_percent(cfg->mbw_max, cprops); + default: + return resctrl_get_default_ctrl(r); + } +} + +int resctrl_arch_update_one(struct rdt_resource *r, struct rdt_ctrl_domain *d, + u32 closid, enum resctrl_conf_type t, u32 cfg_val) +{ + int err; + u32 partid; + struct mpam_config cfg; + struct mpam_props *cprops; + struct mpam_resctrl_res *res; + struct mpam_resctrl_dom *dom; + + lockdep_assert_cpus_held(); + lockdep_assert_irqs_enabled(); + + if (!mpam_is_enabled()) + return -EINVAL; + + /* + * No need to check the CPU as mpam_apply_config() doesn't care, and + * resctrl_arch_update_domains() relies on this. + */ + res = container_of(r, struct mpam_resctrl_res, resctrl_res); + dom = container_of(d, struct mpam_resctrl_dom, resctrl_ctrl_dom); + cprops = &res->class->props; + + if (mpam_resctrl_hide_cdp(r->rid)) + t = CDP_DATA; + + partid = resctrl_get_config_index(closid, t); + if (!r->alloc_capable || partid >= resctrl_arch_get_num_closid(r)) { + pr_debug("Not alloc capable or computed PARTID out of range\n"); + return -EINVAL; + } + + /* + * Copy the current config to avoid clearing other resources when the + * same component is exposed multiple times through resctrl. + */ + cfg = dom->ctrl_comp->cfg[partid]; + + switch (r->rid) { + case RDT_RESOURCE_L2: + case RDT_RESOURCE_L3: + cfg.cpbm = cfg_val; + mpam_set_feature(mpam_feat_cpor_part, &cfg); + break; + case RDT_RESOURCE_MBA: + if (mpam_has_feature(mpam_feat_mbw_max, cprops)) { + cfg.mbw_max = percent_to_mbw_max(cfg_val, cprops); + mpam_set_feature(mpam_feat_mbw_max, &cfg); + break; + } + fallthrough; + default: + return -EINVAL; + } + + /* + * When CDP is enabled, but the resource doesn't support it, we need to + * apply the same configuration to the other partid. + */ + if (mpam_resctrl_hide_cdp(r->rid)) { + partid = resctrl_get_config_index(closid, CDP_CODE); + err = mpam_apply_config(dom->ctrl_comp, partid, &cfg); + if (err) + return err; + + partid = resctrl_get_config_index(closid, CDP_DATA); + return mpam_apply_config(dom->ctrl_comp, partid, &cfg); + } + + return mpam_apply_config(dom->ctrl_comp, partid, &cfg); +} + +int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid) +{ + int err; + struct rdt_ctrl_domain *d; + + lockdep_assert_cpus_held(); + lockdep_assert_irqs_enabled(); + + if (!mpam_is_enabled()) + return -EINVAL; + + list_for_each_entry_rcu(d, &r->ctrl_domains, hdr.list) { + for (enum resctrl_conf_type t = 0; t < CDP_NUM_TYPES; t++) { + struct resctrl_staged_config *cfg = &d->staged_config[t]; + + if (!cfg->have_new_ctrl) + continue; + + err = resctrl_arch_update_one(r, d, closid, t, + cfg->new_ctrl); + if (err) + return err; + } + } + + return 0; +} + +void resctrl_arch_reset_all_ctrls(struct rdt_resource *r) +{ + struct mpam_resctrl_res *res; + + lockdep_assert_cpus_held(); + + if (!mpam_is_enabled()) + return; + + res = container_of(r, struct mpam_resctrl_res, resctrl_res); + mpam_reset_class_locked(res->class); +} + +static void mpam_resctrl_domain_hdr_init(int cpu, struct mpam_component *comp, + enum resctrl_res_level rid, + struct rdt_domain_hdr *hdr) +{ + lockdep_assert_cpus_held(); + + INIT_LIST_HEAD(&hdr->list); + hdr->id = mpam_resctrl_pick_domain_id(cpu, comp); + hdr->rid = rid; + cpumask_set_cpu(cpu, &hdr->cpu_mask); +} + +static void mpam_resctrl_online_domain_hdr(unsigned int cpu, + struct rdt_domain_hdr *hdr) +{ + lockdep_assert_cpus_held(); + + cpumask_set_cpu(cpu, &hdr->cpu_mask); +} + +/** + * mpam_resctrl_offline_domain_hdr() - Update the domain header to remove a CPU. + * @cpu: The CPU to remove from the domain. + * @hdr: The domain's header. + * + * Removes @cpu from the header mask. If this was the last CPU in the domain, + * the domain header is removed from its parent list and true is returned, + * indicating the parent structure can be freed. + * If there are other CPUs in the domain, returns false. + */ +static bool mpam_resctrl_offline_domain_hdr(unsigned int cpu, + struct rdt_domain_hdr *hdr) +{ + lockdep_assert_held(&domain_list_lock); + + cpumask_clear_cpu(cpu, &hdr->cpu_mask); + if (cpumask_empty(&hdr->cpu_mask)) { + list_del_rcu(&hdr->list); + synchronize_rcu(); + return true; + } + + return false; +} + +static void mpam_resctrl_domain_insert(struct list_head *list, + struct rdt_domain_hdr *new) +{ + struct rdt_domain_hdr *err; + struct list_head *pos = NULL; + + lockdep_assert_held(&domain_list_lock); + + err = resctrl_find_domain(list, new->id, &pos); + if (WARN_ON_ONCE(err)) + return; + + list_add_tail_rcu(&new->list, pos); +} + +static struct mpam_component *find_component(struct mpam_class *class, int cpu) +{ + struct mpam_component *comp; + + guard(srcu)(&mpam_srcu); + list_for_each_entry_srcu(comp, &class->components, class_list, + srcu_read_lock_held(&mpam_srcu)) { + if (cpumask_test_cpu(cpu, &comp->affinity)) + return comp; + } + + return NULL; +} + +static struct mpam_resctrl_dom * +mpam_resctrl_alloc_domain(unsigned int cpu, struct mpam_resctrl_res *res) +{ + int err; + struct mpam_resctrl_dom *dom; + struct rdt_l3_mon_domain *mon_d; + struct rdt_ctrl_domain *ctrl_d; + struct mpam_class *class = res->class; + struct mpam_component *comp_iter, *ctrl_comp; + struct rdt_resource *r = &res->resctrl_res; + + lockdep_assert_held(&domain_list_lock); + + ctrl_comp = NULL; + guard(srcu)(&mpam_srcu); + list_for_each_entry_srcu(comp_iter, &class->components, class_list, + srcu_read_lock_held(&mpam_srcu)) { + if (cpumask_test_cpu(cpu, &comp_iter->affinity)) { + ctrl_comp = comp_iter; + break; + } + } + + /* class has no component for this CPU */ + if (WARN_ON_ONCE(!ctrl_comp)) + return ERR_PTR(-EINVAL); + + dom = kzalloc_node(sizeof(*dom), GFP_KERNEL, cpu_to_node(cpu)); + if (!dom) + return ERR_PTR(-ENOMEM); + + if (r->alloc_capable) { + dom->ctrl_comp = ctrl_comp; + + ctrl_d = &dom->resctrl_ctrl_dom; + mpam_resctrl_domain_hdr_init(cpu, ctrl_comp, r->rid, &ctrl_d->hdr); + ctrl_d->hdr.type = RESCTRL_CTRL_DOMAIN; + err = resctrl_online_ctrl_domain(r, ctrl_d); + if (err) + goto free_domain; + + mpam_resctrl_domain_insert(&r->ctrl_domains, &ctrl_d->hdr); + } else { + pr_debug("Skipped control domain online - no controls\n"); + } + + if (r->mon_capable) { + struct mpam_component *any_mon_comp; + struct mpam_resctrl_mon *mon; + enum resctrl_event_id eventid; + + /* + * Even if the monitor domain is backed by a different + * component, the L3 component IDs need to be used... only + * there may be no ctrl_comp for the L3. + * Search each event's class list for a component with + * overlapping CPUs and set up the dom->mon_comp array. + */ + + for_each_mpam_resctrl_mon(mon, eventid) { + struct mpam_component *mon_comp; + + if (!mon->class) + continue; // dummy resource + + mon_comp = find_component(mon->class, cpu); + dom->mon_comp[eventid] = mon_comp; + if (mon_comp) + any_mon_comp = mon_comp; + } + if (!any_mon_comp) { + WARN_ON_ONCE(0); + err = -EFAULT; + goto offline_ctrl_domain; + } + + mon_d = &dom->resctrl_mon_dom; + mpam_resctrl_domain_hdr_init(cpu, any_mon_comp, r->rid, &mon_d->hdr); + mon_d->hdr.type = RESCTRL_MON_DOMAIN; + err = resctrl_online_mon_domain(r, &mon_d->hdr); + if (err) + goto offline_ctrl_domain; + + mpam_resctrl_domain_insert(&r->mon_domains, &mon_d->hdr); + } else { + pr_debug("Skipped monitor domain online - no monitors\n"); + } + + return dom; + +offline_ctrl_domain: + if (r->alloc_capable) { + mpam_resctrl_offline_domain_hdr(cpu, &ctrl_d->hdr); + resctrl_offline_ctrl_domain(r, ctrl_d); + } +free_domain: + kfree(dom); + dom = ERR_PTR(err); + + return dom; +} + +/* + * We know all the monitors are associated with the L3, even if there are no + * controls and therefore no control component. Find the cache-id for the CPU + * and use that to search for existing resctrl domains. + * This relies on mpam_resctrl_pick_domain_id() using the L3 cache-id + * for anything that is not a cache. + */ +static struct mpam_resctrl_dom *mpam_resctrl_get_mon_domain_from_cpu(int cpu) +{ + int cache_id; + struct mpam_resctrl_dom *dom; + struct mpam_resctrl_res *l3 = &mpam_resctrl_controls[RDT_RESOURCE_L3]; + + lockdep_assert_cpus_held(); + + if (!l3->class) + return NULL; + cache_id = get_cpu_cacheinfo_id(cpu, 3); + if (cache_id < 0) + return NULL; + + list_for_each_entry_rcu(dom, &l3->resctrl_res.mon_domains, resctrl_mon_dom.hdr.list) { + if (dom->resctrl_mon_dom.hdr.id == cache_id) + return dom; + } + + return NULL; +} + +static struct mpam_resctrl_dom * +mpam_resctrl_get_domain_from_cpu(int cpu, struct mpam_resctrl_res *res) +{ + struct mpam_resctrl_dom *dom; + struct rdt_resource *r = &res->resctrl_res; + + lockdep_assert_cpus_held(); + + list_for_each_entry_rcu(dom, &r->ctrl_domains, resctrl_ctrl_dom.hdr.list) { + if (cpumask_test_cpu(cpu, &dom->ctrl_comp->affinity)) + return dom; + } + + if (r->rid != RDT_RESOURCE_L3) + return NULL; + + /* Search the mon domain list too - needed on monitor only platforms. */ + return mpam_resctrl_get_mon_domain_from_cpu(cpu); +} + +int mpam_resctrl_online_cpu(unsigned int cpu) +{ + struct mpam_resctrl_res *res; + enum resctrl_res_level rid; + + guard(mutex)(&domain_list_lock); + for_each_mpam_resctrl_control(res, rid) { + struct mpam_resctrl_dom *dom; + struct rdt_resource *r = &res->resctrl_res; + + if (!res->class) + continue; // dummy_resource; + + dom = mpam_resctrl_get_domain_from_cpu(cpu, res); + if (!dom) { + dom = mpam_resctrl_alloc_domain(cpu, res); + if (IS_ERR(dom)) + return PTR_ERR(dom); + } else { + if (r->alloc_capable) { + struct rdt_ctrl_domain *ctrl_d = &dom->resctrl_ctrl_dom; + + mpam_resctrl_online_domain_hdr(cpu, &ctrl_d->hdr); + } + if (r->mon_capable) { + struct rdt_l3_mon_domain *mon_d = &dom->resctrl_mon_dom; + + mpam_resctrl_online_domain_hdr(cpu, &mon_d->hdr); + } + } + } + + resctrl_online_cpu(cpu); + + return 0; +} + +void mpam_resctrl_offline_cpu(unsigned int cpu) +{ + struct mpam_resctrl_res *res; + enum resctrl_res_level rid; + + resctrl_offline_cpu(cpu); + + guard(mutex)(&domain_list_lock); + for_each_mpam_resctrl_control(res, rid) { + struct mpam_resctrl_dom *dom; + struct rdt_l3_mon_domain *mon_d; + struct rdt_ctrl_domain *ctrl_d; + bool ctrl_dom_empty, mon_dom_empty; + struct rdt_resource *r = &res->resctrl_res; + + if (!res->class) + continue; // dummy resource + + dom = mpam_resctrl_get_domain_from_cpu(cpu, res); + if (WARN_ON_ONCE(!dom)) + continue; + + if (r->alloc_capable) { + ctrl_d = &dom->resctrl_ctrl_dom; + ctrl_dom_empty = mpam_resctrl_offline_domain_hdr(cpu, &ctrl_d->hdr); + if (ctrl_dom_empty) + resctrl_offline_ctrl_domain(&res->resctrl_res, ctrl_d); + } else { + ctrl_dom_empty = true; + } + + if (r->mon_capable) { + mon_d = &dom->resctrl_mon_dom; + mon_dom_empty = mpam_resctrl_offline_domain_hdr(cpu, &mon_d->hdr); + if (mon_dom_empty) + resctrl_offline_mon_domain(&res->resctrl_res, &mon_d->hdr); + } else { + mon_dom_empty = true; + } + + if (ctrl_dom_empty && mon_dom_empty) + kfree(dom); + } +} + +int mpam_resctrl_setup(void) +{ + int err = 0; + struct mpam_resctrl_res *res; + enum resctrl_res_level rid; + struct mpam_resctrl_mon *mon; + enum resctrl_event_id eventid; + + wait_event(wait_cacheinfo_ready, cacheinfo_ready); + + cpus_read_lock(); + for_each_mpam_resctrl_control(res, rid) { + INIT_LIST_HEAD_RCU(&res->resctrl_res.ctrl_domains); + INIT_LIST_HEAD_RCU(&res->resctrl_res.mon_domains); + res->resctrl_res.rid = rid; + } + + /* Find some classes to use for controls */ + mpam_resctrl_pick_caches(); + mpam_resctrl_pick_mba(); + + /* Initialise the resctrl structures from the classes */ + for_each_mpam_resctrl_control(res, rid) { + if (!res->class) + continue; // dummy resource + + err = mpam_resctrl_control_init(res); + if (err) { + pr_debug("Failed to initialise rid %u\n", rid); + goto internal_error; + } + } + + /* Find some classes to use for monitors */ + mpam_resctrl_pick_counters(); + + for_each_mpam_resctrl_mon(mon, eventid) { + if (!mon->class) + continue; // dummy resource + + err = mpam_resctrl_monitor_init(mon, eventid); + if (err) { + pr_debug("Failed to initialise event %u\n", eventid); + goto internal_error; + } + } + + cpus_read_unlock(); + + if (!resctrl_arch_alloc_capable() && !resctrl_arch_mon_capable()) { + pr_debug("No alloc(%u) or monitor(%u) found - resctrl not supported\n", + resctrl_arch_alloc_capable(), resctrl_arch_mon_capable()); + return -EOPNOTSUPP; + } + + err = resctrl_init(); + if (err) + return err; + + WRITE_ONCE(resctrl_enabled, true); + + return 0; + +internal_error: + cpus_read_unlock(); + pr_debug("Internal error %d - resctrl not supported\n", err); + return err; +} + +void mpam_resctrl_exit(void) +{ + if (!READ_ONCE(resctrl_enabled)) + return; + + WRITE_ONCE(resctrl_enabled, false); + resctrl_exit(); +} + +/* + * The driver is detaching an MSC from this class, if resctrl was using it, + * pull on resctrl_exit(). + */ +void mpam_resctrl_teardown_class(struct mpam_class *class) +{ + struct mpam_resctrl_res *res; + enum resctrl_res_level rid; + struct mpam_resctrl_mon *mon; + enum resctrl_event_id eventid; + + might_sleep(); + + for_each_mpam_resctrl_control(res, rid) { + if (res->class == class) { + res->class = NULL; + break; + } + } + for_each_mpam_resctrl_mon(mon, eventid) { + if (mon->class == class) { + mon->class = NULL; + break; + } + } +} + +static int __init __cacheinfo_ready(void) +{ + cacheinfo_ready = true; + wake_up(&wait_cacheinfo_ready); + + return 0; +} +device_initcall_sync(__cacheinfo_ready); + +#ifdef CONFIG_MPAM_KUNIT_TEST +#include "test_mpam_resctrl.c" +#endif diff --git a/drivers/resctrl/test_mpam_devices.c b/drivers/resctrl/test_mpam_devices.c new file mode 100644 index 0000000000000..3e8d564a0c647 --- /dev/null +++ b/drivers/resctrl/test_mpam_devices.c @@ -0,0 +1,389 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (C) 2025 Arm Ltd. +/* This file is intended to be included into mpam_devices.c */ + +#include + +/* + * This test catches fields that aren't being sanitised - but can't tell you + * which one... + */ +static void test__props_mismatch(struct kunit *test) +{ + struct mpam_props parent = { 0 }; + struct mpam_props child; + + memset(&child, 0xff, sizeof(child)); + __props_mismatch(&parent, &child, false); + + memset(&child, 0, sizeof(child)); + KUNIT_EXPECT_EQ(test, memcmp(&parent, &child, sizeof(child)), 0); + + memset(&child, 0xff, sizeof(child)); + __props_mismatch(&parent, &child, true); + + KUNIT_EXPECT_EQ(test, memcmp(&parent, &child, sizeof(child)), 0); +} + +static struct list_head fake_classes_list; +static struct mpam_class fake_class = { 0 }; +static struct mpam_component fake_comp1 = { 0 }; +static struct mpam_component fake_comp2 = { 0 }; +static struct mpam_vmsc fake_vmsc1 = { 0 }; +static struct mpam_vmsc fake_vmsc2 = { 0 }; +static struct mpam_msc fake_msc1 = { 0 }; +static struct mpam_msc fake_msc2 = { 0 }; +static struct mpam_msc_ris fake_ris1 = { 0 }; +static struct mpam_msc_ris fake_ris2 = { 0 }; +static struct platform_device fake_pdev = { 0 }; + +static inline void reset_fake_hierarchy(void) +{ + INIT_LIST_HEAD(&fake_classes_list); + + memset(&fake_class, 0, sizeof(fake_class)); + fake_class.level = 3; + fake_class.type = MPAM_CLASS_CACHE; + INIT_LIST_HEAD_RCU(&fake_class.components); + INIT_LIST_HEAD(&fake_class.classes_list); + + memset(&fake_comp1, 0, sizeof(fake_comp1)); + memset(&fake_comp2, 0, sizeof(fake_comp2)); + fake_comp1.comp_id = 1; + fake_comp2.comp_id = 2; + INIT_LIST_HEAD(&fake_comp1.vmsc); + INIT_LIST_HEAD(&fake_comp1.class_list); + INIT_LIST_HEAD(&fake_comp2.vmsc); + INIT_LIST_HEAD(&fake_comp2.class_list); + + memset(&fake_vmsc1, 0, sizeof(fake_vmsc1)); + memset(&fake_vmsc2, 0, sizeof(fake_vmsc2)); + INIT_LIST_HEAD(&fake_vmsc1.ris); + INIT_LIST_HEAD(&fake_vmsc1.comp_list); + fake_vmsc1.msc = &fake_msc1; + INIT_LIST_HEAD(&fake_vmsc2.ris); + INIT_LIST_HEAD(&fake_vmsc2.comp_list); + fake_vmsc2.msc = &fake_msc2; + + memset(&fake_ris1, 0, sizeof(fake_ris1)); + memset(&fake_ris2, 0, sizeof(fake_ris2)); + fake_ris1.ris_idx = 1; + INIT_LIST_HEAD(&fake_ris1.msc_list); + fake_ris2.ris_idx = 2; + INIT_LIST_HEAD(&fake_ris2.msc_list); + + fake_msc1.pdev = &fake_pdev; + fake_msc2.pdev = &fake_pdev; + + list_add(&fake_class.classes_list, &fake_classes_list); +} + +static void test_mpam_enable_merge_features(struct kunit *test) +{ + reset_fake_hierarchy(); + + mutex_lock(&mpam_list_lock); + + /* One Class+Comp, two RIS in one vMSC with common features */ + fake_comp1.class = &fake_class; + list_add(&fake_comp1.class_list, &fake_class.components); + fake_comp2.class = NULL; + fake_vmsc1.comp = &fake_comp1; + list_add(&fake_vmsc1.comp_list, &fake_comp1.vmsc); + fake_vmsc2.comp = NULL; + fake_ris1.vmsc = &fake_vmsc1; + list_add(&fake_ris1.vmsc_list, &fake_vmsc1.ris); + fake_ris2.vmsc = &fake_vmsc1; + list_add(&fake_ris2.vmsc_list, &fake_vmsc1.ris); + + mpam_set_feature(mpam_feat_cpor_part, &fake_ris1.props); + mpam_set_feature(mpam_feat_cpor_part, &fake_ris2.props); + fake_ris1.props.cpbm_wd = 4; + fake_ris2.props.cpbm_wd = 4; + + mpam_enable_merge_features(&fake_classes_list); + + KUNIT_EXPECT_TRUE(test, mpam_has_feature(mpam_feat_cpor_part, &fake_class.props)); + KUNIT_EXPECT_EQ(test, fake_class.props.cpbm_wd, 4); + + reset_fake_hierarchy(); + + /* One Class+Comp, two RIS in one vMSC with non-overlapping features */ + fake_comp1.class = &fake_class; + list_add(&fake_comp1.class_list, &fake_class.components); + fake_comp2.class = NULL; + fake_vmsc1.comp = &fake_comp1; + list_add(&fake_vmsc1.comp_list, &fake_comp1.vmsc); + fake_vmsc2.comp = NULL; + fake_ris1.vmsc = &fake_vmsc1; + list_add(&fake_ris1.vmsc_list, &fake_vmsc1.ris); + fake_ris2.vmsc = &fake_vmsc1; + list_add(&fake_ris2.vmsc_list, &fake_vmsc1.ris); + + mpam_set_feature(mpam_feat_cpor_part, &fake_ris1.props); + mpam_set_feature(mpam_feat_cmax_cmin, &fake_ris2.props); + fake_ris1.props.cpbm_wd = 4; + fake_ris2.props.cmax_wd = 4; + + mpam_enable_merge_features(&fake_classes_list); + + /* Multiple RIS within one MSC controlling the same resource can be mismatched */ + KUNIT_EXPECT_TRUE(test, mpam_has_feature(mpam_feat_cpor_part, &fake_class.props)); + KUNIT_EXPECT_TRUE(test, mpam_has_feature(mpam_feat_cmax_cmin, &fake_class.props)); + KUNIT_EXPECT_TRUE(test, mpam_has_feature(mpam_feat_cmax_cmin, &fake_vmsc1.props)); + KUNIT_EXPECT_EQ(test, fake_class.props.cpbm_wd, 4); + KUNIT_EXPECT_EQ(test, fake_vmsc1.props.cmax_wd, 4); + KUNIT_EXPECT_EQ(test, fake_class.props.cmax_wd, 4); + + reset_fake_hierarchy(); + + /* One Class+Comp, two MSC with overlapping features */ + fake_comp1.class = &fake_class; + list_add(&fake_comp1.class_list, &fake_class.components); + fake_comp2.class = NULL; + fake_vmsc1.comp = &fake_comp1; + list_add(&fake_vmsc1.comp_list, &fake_comp1.vmsc); + fake_vmsc2.comp = &fake_comp1; + list_add(&fake_vmsc2.comp_list, &fake_comp1.vmsc); + fake_ris1.vmsc = &fake_vmsc1; + list_add(&fake_ris1.vmsc_list, &fake_vmsc1.ris); + fake_ris2.vmsc = &fake_vmsc2; + list_add(&fake_ris2.vmsc_list, &fake_vmsc2.ris); + + mpam_set_feature(mpam_feat_cpor_part, &fake_ris1.props); + mpam_set_feature(mpam_feat_cpor_part, &fake_ris2.props); + fake_ris1.props.cpbm_wd = 4; + fake_ris2.props.cpbm_wd = 4; + + mpam_enable_merge_features(&fake_classes_list); + + KUNIT_EXPECT_TRUE(test, mpam_has_feature(mpam_feat_cpor_part, &fake_class.props)); + KUNIT_EXPECT_EQ(test, fake_class.props.cpbm_wd, 4); + + reset_fake_hierarchy(); + + /* One Class+Comp, two MSC with non-overlapping features */ + fake_comp1.class = &fake_class; + list_add(&fake_comp1.class_list, &fake_class.components); + fake_comp2.class = NULL; + fake_vmsc1.comp = &fake_comp1; + list_add(&fake_vmsc1.comp_list, &fake_comp1.vmsc); + fake_vmsc2.comp = &fake_comp1; + list_add(&fake_vmsc2.comp_list, &fake_comp1.vmsc); + fake_ris1.vmsc = &fake_vmsc1; + list_add(&fake_ris1.vmsc_list, &fake_vmsc1.ris); + fake_ris2.vmsc = &fake_vmsc2; + list_add(&fake_ris2.vmsc_list, &fake_vmsc2.ris); + + mpam_set_feature(mpam_feat_cpor_part, &fake_ris1.props); + mpam_set_feature(mpam_feat_cmax_cmin, &fake_ris2.props); + fake_ris1.props.cpbm_wd = 4; + fake_ris2.props.cmax_wd = 4; + + mpam_enable_merge_features(&fake_classes_list); + + /* + * Multiple RIS in different MSC can't control the same resource, + * mismatched features can not be supported. + */ + KUNIT_EXPECT_FALSE(test, mpam_has_feature(mpam_feat_cpor_part, &fake_class.props)); + KUNIT_EXPECT_FALSE(test, mpam_has_feature(mpam_feat_cmax_cmin, &fake_class.props)); + KUNIT_EXPECT_EQ(test, fake_class.props.cpbm_wd, 0); + KUNIT_EXPECT_EQ(test, fake_class.props.cmax_wd, 0); + + reset_fake_hierarchy(); + + /* One Class+Comp, two MSC with incompatible overlapping features */ + fake_comp1.class = &fake_class; + list_add(&fake_comp1.class_list, &fake_class.components); + fake_comp2.class = NULL; + fake_vmsc1.comp = &fake_comp1; + list_add(&fake_vmsc1.comp_list, &fake_comp1.vmsc); + fake_vmsc2.comp = &fake_comp1; + list_add(&fake_vmsc2.comp_list, &fake_comp1.vmsc); + fake_ris1.vmsc = &fake_vmsc1; + list_add(&fake_ris1.vmsc_list, &fake_vmsc1.ris); + fake_ris2.vmsc = &fake_vmsc2; + list_add(&fake_ris2.vmsc_list, &fake_vmsc2.ris); + + mpam_set_feature(mpam_feat_cpor_part, &fake_ris1.props); + mpam_set_feature(mpam_feat_cpor_part, &fake_ris2.props); + mpam_set_feature(mpam_feat_mbw_part, &fake_ris1.props); + mpam_set_feature(mpam_feat_mbw_part, &fake_ris2.props); + fake_ris1.props.cpbm_wd = 5; + fake_ris2.props.cpbm_wd = 3; + fake_ris1.props.mbw_pbm_bits = 5; + fake_ris2.props.mbw_pbm_bits = 3; + + mpam_enable_merge_features(&fake_classes_list); + + /* + * Multiple RIS in different MSC can't control the same resource, + * mismatched features can not be supported. + */ + KUNIT_EXPECT_FALSE(test, mpam_has_feature(mpam_feat_cpor_part, &fake_class.props)); + KUNIT_EXPECT_FALSE(test, mpam_has_feature(mpam_feat_mbw_part, &fake_class.props)); + KUNIT_EXPECT_EQ(test, fake_class.props.cpbm_wd, 0); + KUNIT_EXPECT_EQ(test, fake_class.props.mbw_pbm_bits, 0); + + reset_fake_hierarchy(); + + /* One Class+Comp, two MSC with overlapping features that need tweaking */ + fake_comp1.class = &fake_class; + list_add(&fake_comp1.class_list, &fake_class.components); + fake_comp2.class = NULL; + fake_vmsc1.comp = &fake_comp1; + list_add(&fake_vmsc1.comp_list, &fake_comp1.vmsc); + fake_vmsc2.comp = &fake_comp1; + list_add(&fake_vmsc2.comp_list, &fake_comp1.vmsc); + fake_ris1.vmsc = &fake_vmsc1; + list_add(&fake_ris1.vmsc_list, &fake_vmsc1.ris); + fake_ris2.vmsc = &fake_vmsc2; + list_add(&fake_ris2.vmsc_list, &fake_vmsc2.ris); + + mpam_set_feature(mpam_feat_mbw_min, &fake_ris1.props); + mpam_set_feature(mpam_feat_mbw_min, &fake_ris2.props); + mpam_set_feature(mpam_feat_cmax_cmax, &fake_ris1.props); + mpam_set_feature(mpam_feat_cmax_cmax, &fake_ris2.props); + fake_ris1.props.bwa_wd = 5; + fake_ris2.props.bwa_wd = 3; + fake_ris1.props.cmax_wd = 5; + fake_ris2.props.cmax_wd = 3; + + mpam_enable_merge_features(&fake_classes_list); + + /* + * RIS with different control properties need to be sanitised so the + * class has the common set of properties. + */ + KUNIT_EXPECT_TRUE(test, mpam_has_feature(mpam_feat_mbw_min, &fake_class.props)); + KUNIT_EXPECT_TRUE(test, mpam_has_feature(mpam_feat_cmax_cmax, &fake_class.props)); + KUNIT_EXPECT_EQ(test, fake_class.props.bwa_wd, 3); + KUNIT_EXPECT_EQ(test, fake_class.props.cmax_wd, 3); + + reset_fake_hierarchy(); + + /* One Class Two Comp with overlapping features */ + fake_comp1.class = &fake_class; + list_add(&fake_comp1.class_list, &fake_class.components); + fake_comp2.class = &fake_class; + list_add(&fake_comp2.class_list, &fake_class.components); + fake_vmsc1.comp = &fake_comp1; + list_add(&fake_vmsc1.comp_list, &fake_comp1.vmsc); + fake_vmsc2.comp = &fake_comp2; + list_add(&fake_vmsc2.comp_list, &fake_comp2.vmsc); + fake_ris1.vmsc = &fake_vmsc1; + list_add(&fake_ris1.vmsc_list, &fake_vmsc1.ris); + fake_ris2.vmsc = &fake_vmsc2; + list_add(&fake_ris2.vmsc_list, &fake_vmsc2.ris); + + mpam_set_feature(mpam_feat_cpor_part, &fake_ris1.props); + mpam_set_feature(mpam_feat_cpor_part, &fake_ris2.props); + fake_ris1.props.cpbm_wd = 4; + fake_ris2.props.cpbm_wd = 4; + + mpam_enable_merge_features(&fake_classes_list); + + KUNIT_EXPECT_TRUE(test, mpam_has_feature(mpam_feat_cpor_part, &fake_class.props)); + KUNIT_EXPECT_EQ(test, fake_class.props.cpbm_wd, 4); + + reset_fake_hierarchy(); + + /* One Class Two Comp with non-overlapping features */ + fake_comp1.class = &fake_class; + list_add(&fake_comp1.class_list, &fake_class.components); + fake_comp2.class = &fake_class; + list_add(&fake_comp2.class_list, &fake_class.components); + fake_vmsc1.comp = &fake_comp1; + list_add(&fake_vmsc1.comp_list, &fake_comp1.vmsc); + fake_vmsc2.comp = &fake_comp2; + list_add(&fake_vmsc2.comp_list, &fake_comp2.vmsc); + fake_ris1.vmsc = &fake_vmsc1; + list_add(&fake_ris1.vmsc_list, &fake_vmsc1.ris); + fake_ris2.vmsc = &fake_vmsc2; + list_add(&fake_ris2.vmsc_list, &fake_vmsc2.ris); + + mpam_set_feature(mpam_feat_cpor_part, &fake_ris1.props); + mpam_set_feature(mpam_feat_cmax_cmin, &fake_ris2.props); + fake_ris1.props.cpbm_wd = 4; + fake_ris2.props.cmax_wd = 4; + + mpam_enable_merge_features(&fake_classes_list); + + /* + * Multiple components can't control the same resource, mismatched features can + * not be supported. + */ + KUNIT_EXPECT_FALSE(test, mpam_has_feature(mpam_feat_cpor_part, &fake_class.props)); + KUNIT_EXPECT_FALSE(test, mpam_has_feature(mpam_feat_cmax_cmin, &fake_class.props)); + KUNIT_EXPECT_EQ(test, fake_class.props.cpbm_wd, 0); + KUNIT_EXPECT_EQ(test, fake_class.props.cmax_wd, 0); + + mutex_unlock(&mpam_list_lock); +} + +static void test_mpam_reset_msc_bitmap(struct kunit *test) +{ + char __iomem *buf = kunit_kzalloc(test, SZ_16K, GFP_KERNEL); + struct mpam_msc fake_msc = {}; + u32 *test_result; + + if (!buf) + return; + + fake_msc.mapped_hwpage = buf; + fake_msc.mapped_hwpage_sz = SZ_16K; + cpumask_copy(&fake_msc.accessibility, cpu_possible_mask); + + /* Satisfy lockdep checks */ + mutex_init(&fake_msc.part_sel_lock); + mutex_lock(&fake_msc.part_sel_lock); + + test_result = (u32 *)(buf + MPAMCFG_CPBM); + + mpam_reset_msc_bitmap(&fake_msc, MPAMCFG_CPBM, 0); + KUNIT_EXPECT_EQ(test, test_result[0], 0); + KUNIT_EXPECT_EQ(test, test_result[1], 0); + test_result[0] = 0; + test_result[1] = 0; + + mpam_reset_msc_bitmap(&fake_msc, MPAMCFG_CPBM, 1); + KUNIT_EXPECT_EQ(test, test_result[0], 1); + KUNIT_EXPECT_EQ(test, test_result[1], 0); + test_result[0] = 0; + test_result[1] = 0; + + mpam_reset_msc_bitmap(&fake_msc, MPAMCFG_CPBM, 16); + KUNIT_EXPECT_EQ(test, test_result[0], 0xffff); + KUNIT_EXPECT_EQ(test, test_result[1], 0); + test_result[0] = 0; + test_result[1] = 0; + + mpam_reset_msc_bitmap(&fake_msc, MPAMCFG_CPBM, 32); + KUNIT_EXPECT_EQ(test, test_result[0], 0xffffffff); + KUNIT_EXPECT_EQ(test, test_result[1], 0); + test_result[0] = 0; + test_result[1] = 0; + + mpam_reset_msc_bitmap(&fake_msc, MPAMCFG_CPBM, 33); + KUNIT_EXPECT_EQ(test, test_result[0], 0xffffffff); + KUNIT_EXPECT_EQ(test, test_result[1], 1); + test_result[0] = 0; + test_result[1] = 0; + + mutex_unlock(&fake_msc.part_sel_lock); +} + +static struct kunit_case mpam_devices_test_cases[] = { + KUNIT_CASE(test_mpam_reset_msc_bitmap), + KUNIT_CASE(test_mpam_enable_merge_features), + KUNIT_CASE(test__props_mismatch), + {} +}; + +static struct kunit_suite mpam_devices_test_suite = { + .name = "mpam_devices_test_suite", + .test_cases = mpam_devices_test_cases, +}; + +kunit_test_suites(&mpam_devices_test_suite); diff --git a/drivers/resctrl/test_mpam_resctrl.c b/drivers/resctrl/test_mpam_resctrl.c new file mode 100644 index 0000000000000..b93d6ad87e43f --- /dev/null +++ b/drivers/resctrl/test_mpam_resctrl.c @@ -0,0 +1,315 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (C) 2025 Arm Ltd. +/* This file is intended to be included into mpam_resctrl.c */ + +#include +#include +#include +#include +#include + +struct percent_value_case { + u8 pc; + u8 width; + u16 value; +}; + +/* + * Mysterious inscriptions taken from the union of ARM DDI 0598D.b, + * "Arm Architecture Reference Manual Supplement - Memory System + * Resource Partitioning and Monitoring (MPAM), for A-profile + * architecture", Section 9.8, "About the fixed-point fractional + * format" (exact percentage entries only) and ARM IHI0099B.a + * "MPAM system component specification", Section 9.3, + * "The fixed-point fractional format": + */ +static const struct percent_value_case percent_value_cases[] = { + /* Architectural cases: */ + { 1, 8, 1 }, { 1, 12, 0x27 }, { 1, 16, 0x28e }, + { 25, 8, 0x3f }, { 25, 12, 0x3ff }, { 25, 16, 0x3fff }, + { 33, 8, 0x53 }, { 33, 12, 0x546 }, { 33, 16, 0x5479 }, + { 35, 8, 0x58 }, { 35, 12, 0x598 }, { 35, 16, 0x5998 }, + { 45, 8, 0x72 }, { 45, 12, 0x732 }, { 45, 16, 0x7332 }, + { 50, 8, 0x7f }, { 50, 12, 0x7ff }, { 50, 16, 0x7fff }, + { 52, 8, 0x84 }, { 52, 12, 0x850 }, { 52, 16, 0x851d }, + { 55, 8, 0x8b }, { 55, 12, 0x8cb }, { 55, 16, 0x8ccb }, + { 58, 8, 0x93 }, { 58, 12, 0x946 }, { 58, 16, 0x9479 }, + { 75, 8, 0xbf }, { 75, 12, 0xbff }, { 75, 16, 0xbfff }, + { 80, 8, 0xcb }, { 80, 12, 0xccb }, { 80, 16, 0xcccb }, + { 88, 8, 0xe0 }, { 88, 12, 0xe13 }, { 88, 16, 0xe146 }, + { 95, 8, 0xf2 }, { 95, 12, 0xf32 }, { 95, 16, 0xf332 }, + { 100, 8, 0xff }, { 100, 12, 0xfff }, { 100, 16, 0xffff }, +}; + +static void test_percent_value_desc(const struct percent_value_case *param, + char *desc) +{ + snprintf(desc, KUNIT_PARAM_DESC_SIZE, + "pc=%d, width=%d, value=0x%.*x\n", + param->pc, param->width, + DIV_ROUND_UP(param->width, 4), param->value); +} + +KUNIT_ARRAY_PARAM(test_percent_value, percent_value_cases, + test_percent_value_desc); + +struct percent_value_test_info { + u32 pc; /* result of value-to-percent conversion */ + u32 value; /* result of percent-to-value conversion */ + u32 max_value; /* maximum raw value allowed by test params */ + unsigned int shift; /* promotes raw testcase value to 16 bits */ +}; + +/* + * Convert a reference percentage to a fixed-point MAX value and + * vice-versa, based on param (not test->param_value!) + */ +static void __prepare_percent_value_test(struct kunit *test, + struct percent_value_test_info *res, + const struct percent_value_case *param) +{ + struct mpam_props fake_props = { }; + + /* Reject bogus test parameters that would break the tests: */ + KUNIT_ASSERT_GE(test, param->width, 1); + KUNIT_ASSERT_LE(test, param->width, 16); + KUNIT_ASSERT_LT(test, param->value, 1 << param->width); + + mpam_set_feature(mpam_feat_mbw_max, &fake_props); + fake_props.bwa_wd = param->width; + + res->shift = 16 - param->width; + res->max_value = GENMASK_U32(param->width - 1, 0); + res->value = percent_to_mbw_max(param->pc, &fake_props); + res->pc = mbw_max_to_percent(param->value << res->shift, &fake_props); +} + +static void test_get_mba_granularity(struct kunit *test) +{ + int ret; + struct mpam_props fake_props = { }; + + /* Use MBW_MAX */ + mpam_set_feature(mpam_feat_mbw_max, &fake_props); + + fake_props.bwa_wd = 0; + KUNIT_EXPECT_FALSE(test, mba_class_use_mbw_max(&fake_props)); + + fake_props.bwa_wd = 1; + KUNIT_EXPECT_TRUE(test, mba_class_use_mbw_max(&fake_props)); + + /* Architectural maximum: */ + fake_props.bwa_wd = 16; + KUNIT_EXPECT_TRUE(test, mba_class_use_mbw_max(&fake_props)); + + /* No usable control... */ + fake_props.bwa_wd = 0; + ret = get_mba_granularity(&fake_props); + KUNIT_EXPECT_EQ(test, ret, 0); + + fake_props.bwa_wd = 1; + ret = get_mba_granularity(&fake_props); + KUNIT_EXPECT_EQ(test, ret, 50); /* DIV_ROUND_UP(100, 1 << 1)% = 50% */ + + fake_props.bwa_wd = 2; + ret = get_mba_granularity(&fake_props); + KUNIT_EXPECT_EQ(test, ret, 25); /* DIV_ROUND_UP(100, 1 << 2)% = 25% */ + + fake_props.bwa_wd = 3; + ret = get_mba_granularity(&fake_props); + KUNIT_EXPECT_EQ(test, ret, 13); /* DIV_ROUND_UP(100, 1 << 3)% = 13% */ + + fake_props.bwa_wd = 6; + ret = get_mba_granularity(&fake_props); + KUNIT_EXPECT_EQ(test, ret, 2); /* DIV_ROUND_UP(100, 1 << 6)% = 2% */ + + fake_props.bwa_wd = 7; + ret = get_mba_granularity(&fake_props); + KUNIT_EXPECT_EQ(test, ret, 1); /* DIV_ROUND_UP(100, 1 << 7)% = 1% */ + + /* Granularity saturates at 1% */ + fake_props.bwa_wd = 16; /* architectural maximum */ + ret = get_mba_granularity(&fake_props); + KUNIT_EXPECT_EQ(test, ret, 1); /* DIV_ROUND_UP(100, 1 << 16)% = 1% */ +} + +static void test_mbw_max_to_percent(struct kunit *test) +{ + const struct percent_value_case *param = test->param_value; + struct percent_value_test_info res; + + /* + * Since the reference values in percent_value_cases[] all + * correspond to exact percentages, round-to-nearest will + * always give the exact percentage back when the MPAM max + * value has precision of 0.5% or finer. (Always true for the + * reference data, since they all specify 8 bits or more of + * precision. + * + * So, keep it simple and demand an exact match: + */ + __prepare_percent_value_test(test, &res, param); + KUNIT_EXPECT_EQ(test, res.pc, param->pc); +} + +static void test_percent_to_mbw_max(struct kunit *test) +{ + const struct percent_value_case *param = test->param_value; + struct percent_value_test_info res; + + __prepare_percent_value_test(test, &res, param); + + KUNIT_EXPECT_GE(test, res.value, param->value << res.shift); + KUNIT_EXPECT_LE(test, res.value, (param->value + 1) << res.shift); + KUNIT_EXPECT_LE(test, res.value, res.max_value << res.shift); + + /* No flexibility allowed for 0% and 100%! */ + + if (param->pc == 0) + KUNIT_EXPECT_EQ(test, res.value, 0); + + if (param->pc == 100) + KUNIT_EXPECT_EQ(test, res.value, res.max_value << res.shift); +} + +static const void *test_all_bwa_wd_gen_params(struct kunit *test, const void *prev, + char *desc) +{ + uintptr_t param = (uintptr_t)prev; + + if (param > 15) + return NULL; + + param++; + + snprintf(desc, KUNIT_PARAM_DESC_SIZE, "wd=%u\n", (unsigned int)param); + + return (void *)param; +} + +static unsigned int test_get_bwa_wd(struct kunit *test) +{ + uintptr_t param = (uintptr_t)test->param_value; + + KUNIT_ASSERT_GE(test, param, 1); + KUNIT_ASSERT_LE(test, param, 16); + + return param; +} + +static void test_mbw_max_to_percent_limits(struct kunit *test) +{ + struct mpam_props fake_props = {0}; + u32 max_value; + + mpam_set_feature(mpam_feat_mbw_max, &fake_props); + fake_props.bwa_wd = test_get_bwa_wd(test); + max_value = GENMASK(15, 16 - fake_props.bwa_wd); + + KUNIT_EXPECT_EQ(test, mbw_max_to_percent(max_value, &fake_props), + MAX_MBA_BW); + KUNIT_EXPECT_EQ(test, mbw_max_to_percent(0, &fake_props), + get_mba_min(&fake_props)); + + /* + * Rounding policy dependent 0% sanity-check: + * With round-to-nearest, the minimum mbw_max value really + * should map to 0% if there are at least 200 steps. + * (100 steps may be enough for some other rounding policies.) + */ + if (fake_props.bwa_wd >= 8) + KUNIT_EXPECT_EQ(test, mbw_max_to_percent(0, &fake_props), 0); + + if (fake_props.bwa_wd < 8 && + mbw_max_to_percent(0, &fake_props) == 0) + kunit_warn(test, "wd=%d: Testsuite/driver Rounding policy mismatch?", + fake_props.bwa_wd); +} + +/* + * Check that converting a percentage to mbw_max and back again (or, as + * appropriate, vice-versa) always restores the original value: + */ +static void test_percent_max_roundtrip_stability(struct kunit *test) +{ + struct mpam_props fake_props = {0}; + unsigned int shift; + u32 pc, max, pc2, max2; + + mpam_set_feature(mpam_feat_mbw_max, &fake_props); + fake_props.bwa_wd = test_get_bwa_wd(test); + shift = 16 - fake_props.bwa_wd; + + /* + * Converting a valid value from the coarser scale to the finer + * scale and back again must yield the original value: + */ + if (fake_props.bwa_wd >= 7) { + /* More than 100 steps: only test exact pc values: */ + for (pc = get_mba_min(&fake_props); pc <= MAX_MBA_BW; pc++) { + max = percent_to_mbw_max(pc, &fake_props); + pc2 = mbw_max_to_percent(max, &fake_props); + KUNIT_EXPECT_EQ(test, pc2, pc); + } + } else { + /* Fewer than 100 steps: only test exact mbw_max values: */ + for (max = 0; max < 1 << 16; max += 1 << shift) { + pc = mbw_max_to_percent(max, &fake_props); + max2 = percent_to_mbw_max(pc, &fake_props); + KUNIT_EXPECT_EQ(test, max2, max); + } + } +} + +static void test_percent_to_max_rounding(struct kunit *test) +{ + const struct percent_value_case *param = test->param_value; + unsigned int num_rounded_up = 0, total = 0; + struct percent_value_test_info res; + + for (param = percent_value_cases, total = 0; + param < &percent_value_cases[ARRAY_SIZE(percent_value_cases)]; + param++, total++) { + __prepare_percent_value_test(test, &res, param); + if (res.value > param->value << res.shift) + num_rounded_up++; + } + + /* + * The MPAM driver applies a round-to-nearest policy, whereas a + * round-down policy seems to have been applied in the + * reference table from which the test vectors were selected. + * + * For a large and well-distributed suite of test vectors, + * about half should be rounded up and half down compared with + * the reference table. The actual test vectors are few in + * number and probably not very well distributed however, so + * tolerate a round-up rate of between 1/4 and 3/4 before + * crying foul: + */ + + kunit_info(test, "Round-up rate: %u%% (%u/%u)\n", + DIV_ROUND_CLOSEST(num_rounded_up * 100, total), + num_rounded_up, total); + + KUNIT_EXPECT_GE(test, 4 * num_rounded_up, 1 * total); + KUNIT_EXPECT_LE(test, 4 * num_rounded_up, 3 * total); +} + +static struct kunit_case mpam_resctrl_test_cases[] = { + KUNIT_CASE(test_get_mba_granularity), + KUNIT_CASE_PARAM(test_mbw_max_to_percent, test_percent_value_gen_params), + KUNIT_CASE_PARAM(test_percent_to_mbw_max, test_percent_value_gen_params), + KUNIT_CASE_PARAM(test_mbw_max_to_percent_limits, test_all_bwa_wd_gen_params), + KUNIT_CASE(test_percent_to_max_rounding), + KUNIT_CASE_PARAM(test_percent_max_roundtrip_stability, + test_all_bwa_wd_gen_params), + {} +}; + +static struct kunit_suite mpam_resctrl_test_suite = { + .name = "mpam_resctrl_test_suite", + .test_cases = mpam_resctrl_test_cases, +}; + +kunit_test_suites(&mpam_resctrl_test_suite); diff --git a/drivers/vfio/pci/Kconfig b/drivers/vfio/pci/Kconfig index 2b0172f546652..2b9fca00e9e8b 100644 --- a/drivers/vfio/pci/Kconfig +++ b/drivers/vfio/pci/Kconfig @@ -55,6 +55,9 @@ config VFIO_PCI_ZDEV_KVM To enable s390x KVM vfio-pci extensions, say Y. +config VFIO_PCI_DMABUF + def_bool y if VFIO_PCI_CORE && PCI_P2PDMA && DMA_SHARED_BUFFER + source "drivers/vfio/pci/mlx5/Kconfig" source "drivers/vfio/pci/hisilicon/Kconfig" diff --git a/drivers/vfio/pci/Makefile b/drivers/vfio/pci/Makefile index cf00c0a7e55c8..53f59226ae013 100644 --- a/drivers/vfio/pci/Makefile +++ b/drivers/vfio/pci/Makefile @@ -2,6 +2,7 @@ vfio-pci-core-y := vfio_pci_core.o vfio_pci_intrs.o vfio_pci_rdwr.o vfio_pci_config.o vfio-pci-core-$(CONFIG_VFIO_PCI_ZDEV_KVM) += vfio_pci_zdev.o +vfio-pci-core-$(CONFIG_VFIO_PCI_DMABUF) += vfio_pci_dmabuf.o obj-$(CONFIG_VFIO_PCI_CORE) += vfio-pci-core.o vfio-pci-y := vfio_pci.o diff --git a/drivers/vfio/pci/nvgrace-gpu/main.c b/drivers/vfio/pci/nvgrace-gpu/main.c index 3dc3c2432b5e0..4232ce33a2591 100644 --- a/drivers/vfio/pci/nvgrace-gpu/main.c +++ b/drivers/vfio/pci/nvgrace-gpu/main.c @@ -7,6 +7,9 @@ #include #include #include +#include +#include +#include /* * The device memory usable to the workloads running in the VM is cached @@ -47,6 +50,7 @@ struct mem_region { void *memaddr; void __iomem *ioaddr; }; /* Base virtual address of the region */ + struct pfn_address_space pfn_address_space; }; struct nvgrace_gpu_pci_core_device { @@ -58,6 +62,8 @@ struct nvgrace_gpu_pci_core_device { /* Lock to control device memory kernel mapping */ struct mutex remap_lock; bool has_mig_hw_bug; + /* GPU has just been reset */ + bool reset_done; }; static void nvgrace_gpu_init_fake_bar_emu_regs(struct vfio_device *core_vdev) @@ -84,6 +90,80 @@ nvgrace_gpu_memregion(int index, return NULL; } +static int pfn_memregion_offset(struct nvgrace_gpu_pci_core_device *nvdev, + unsigned int index, + unsigned long pfn, + pgoff_t *pfn_offset_in_region) +{ + struct mem_region *region; + unsigned long start_pfn, num_pages; + + region = nvgrace_gpu_memregion(index, nvdev); + if (!region) + return -EINVAL; + + start_pfn = PHYS_PFN(region->memphys); + num_pages = region->memlength >> PAGE_SHIFT; + + if (pfn < start_pfn || pfn >= start_pfn + num_pages) + return -EFAULT; + + *pfn_offset_in_region = pfn - start_pfn; + + return 0; +} + +static inline +struct nvgrace_gpu_pci_core_device *vma_to_nvdev(struct vm_area_struct *vma); + +static int nvgrace_gpu_pfn_to_vma_pgoff(struct vm_area_struct *vma, + unsigned long pfn, + pgoff_t *pgoff) +{ + struct nvgrace_gpu_pci_core_device *nvdev; + unsigned int index = + vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT); + pgoff_t vma_offset_in_region = vma->vm_pgoff & + ((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1); + pgoff_t pfn_offset_in_region; + int ret; + + nvdev = vma_to_nvdev(vma); + if (!nvdev) + return -ENOENT; + + ret = pfn_memregion_offset(nvdev, index, pfn, &pfn_offset_in_region); + if (ret) + return ret; + + /* Ensure PFN is not before VMA's start within the region */ + if (pfn_offset_in_region < vma_offset_in_region) + return -EFAULT; + + /* Calculate offset from VMA start */ + *pgoff = vma->vm_pgoff + + (pfn_offset_in_region - vma_offset_in_region); + + return 0; +} + +static int +nvgrace_gpu_vfio_pci_register_pfn_range(struct vfio_device *core_vdev, + struct mem_region *region) +{ + unsigned long pfn, nr_pages; + + pfn = PHYS_PFN(region->memphys); + nr_pages = region->memlength >> PAGE_SHIFT; + + region->pfn_address_space.node.start = pfn; + region->pfn_address_space.node.last = pfn + nr_pages - 1; + region->pfn_address_space.mapping = core_vdev->inode->i_mapping; + region->pfn_address_space.pfn_to_vma_pgoff = nvgrace_gpu_pfn_to_vma_pgoff; + + return register_pfn_address_space(®ion->pfn_address_space); +} + static int nvgrace_gpu_open_device(struct vfio_device *core_vdev) { struct vfio_pci_core_device *vdev = @@ -102,9 +182,36 @@ static int nvgrace_gpu_open_device(struct vfio_device *core_vdev) mutex_init(&nvdev->remap_lock); } - vfio_pci_core_finish_enable(vdev); + /* + * GPU readiness is checked by reading the BAR0 registers. + * + * ioremap BAR0 to ensure that the BAR0 mapping is present before + * register reads on first fault before establishing any GPU + * memory mapping. + */ + ret = vfio_pci_core_setup_barmap(vdev, 0); + if (ret) + goto error_exit; + + if (nvdev->resmem.memlength) { + ret = nvgrace_gpu_vfio_pci_register_pfn_range(core_vdev, &nvdev->resmem); + if (ret && ret != -EOPNOTSUPP) + goto error_exit; + } + + ret = nvgrace_gpu_vfio_pci_register_pfn_range(core_vdev, &nvdev->usemem); + if (ret && ret != -EOPNOTSUPP) + goto register_mem_failed; + vfio_pci_core_finish_enable(vdev); return 0; + +register_mem_failed: + if (nvdev->resmem.memlength) + unregister_pfn_address_space(&nvdev->resmem.pfn_address_space); +error_exit: + vfio_pci_core_disable(vdev); + return ret; } static void nvgrace_gpu_close_device(struct vfio_device *core_vdev) @@ -113,6 +220,11 @@ static void nvgrace_gpu_close_device(struct vfio_device *core_vdev) container_of(core_vdev, struct nvgrace_gpu_pci_core_device, core_device.vdev); + if (nvdev->resmem.memlength) + unregister_pfn_address_space(&nvdev->resmem.pfn_address_space); + + unregister_pfn_address_space(&nvdev->usemem.pfn_address_space); + /* Unmap the mapping to the device memory cached region */ if (nvdev->usemem.memaddr) { memunmap(nvdev->usemem.memaddr); @@ -130,6 +242,116 @@ static void nvgrace_gpu_close_device(struct vfio_device *core_vdev) vfio_pci_core_close_device(core_vdev); } +static int nvgrace_gpu_wait_device_ready(void __iomem *io) +{ + unsigned long timeout = jiffies + msecs_to_jiffies(POLL_TIMEOUT_MS); + + do { + if ((ioread32(io + C2C_LINK_BAR0_OFFSET) == STATUS_READY) && + (ioread32(io + HBM_TRAINING_BAR0_OFFSET) == STATUS_READY)) + return 0; + msleep(POLL_QUANTUM_MS); + } while (!time_after(jiffies, timeout)); + + return -ETIME; +} + +/* + * If the GPU memory is accessed by the CPU while the GPU is not ready + * after reset, it can cause harmless corrected RAS events to be logged. + * Make sure the GPU is ready before establishing the mappings. + */ +static int +nvgrace_gpu_check_device_ready(struct nvgrace_gpu_pci_core_device *nvdev) +{ + struct vfio_pci_core_device *vdev = &nvdev->core_device; + int ret; + + lockdep_assert_held_read(&vdev->memory_lock); + + if (!nvdev->reset_done) + return 0; + + if (!__vfio_pci_memory_enabled(vdev)) + return -EIO; + + ret = nvgrace_gpu_wait_device_ready(vdev->barmap[0]); + if (ret) + return ret; + + nvdev->reset_done = false; + + return 0; +} + +static unsigned long addr_to_pgoff(struct vm_area_struct *vma, + unsigned long addr) +{ + u64 pgoff = vma->vm_pgoff & + ((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1); + + return ((addr - vma->vm_start) >> PAGE_SHIFT) + pgoff; +} + +static vm_fault_t nvgrace_gpu_vfio_pci_huge_fault(struct vm_fault *vmf, + unsigned int order) +{ + struct vm_area_struct *vma = vmf->vma; + struct nvgrace_gpu_pci_core_device *nvdev = vma->vm_private_data; + struct vfio_pci_core_device *vdev = &nvdev->core_device; + unsigned int index = + vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT); + vm_fault_t ret = VM_FAULT_FALLBACK; + struct mem_region *memregion; + unsigned long pfn, addr; + + memregion = nvgrace_gpu_memregion(index, nvdev); + if (!memregion) + return VM_FAULT_SIGBUS; + + addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order); + pfn = PHYS_PFN(memregion->memphys) + addr_to_pgoff(vma, addr); + + if (is_aligned_for_order(vma, addr, pfn, order)) { + scoped_guard(rwsem_read, &vdev->memory_lock) { + if (vdev->pm_runtime_engaged || + nvgrace_gpu_check_device_ready(nvdev)) + return VM_FAULT_SIGBUS; + + ret = vfio_pci_vmf_insert_pfn(vdev, vmf, pfn, order); + } + } + + dev_dbg_ratelimited(&vdev->pdev->dev, + "%s order = %d pfn 0x%lx: 0x%x\n", + __func__, order, pfn, + (unsigned int)ret); + + return ret; +} + +static vm_fault_t nvgrace_gpu_vfio_pci_fault(struct vm_fault *vmf) +{ + return nvgrace_gpu_vfio_pci_huge_fault(vmf, 0); +} + +static const struct vm_operations_struct nvgrace_gpu_vfio_pci_mmap_ops = { + .fault = nvgrace_gpu_vfio_pci_fault, +#ifdef CONFIG_ARCH_SUPPORTS_HUGE_PFNMAP + .huge_fault = nvgrace_gpu_vfio_pci_huge_fault, +#endif +}; + +static inline +struct nvgrace_gpu_pci_core_device *vma_to_nvdev(struct vm_area_struct *vma) +{ + /* Check if this VMA belongs to us */ + if (vma->vm_ops != &nvgrace_gpu_vfio_pci_mmap_ops) + return NULL; + + return vma->vm_private_data; +} + static int nvgrace_gpu_mmap(struct vfio_device *core_vdev, struct vm_area_struct *vma) { @@ -137,10 +359,8 @@ static int nvgrace_gpu_mmap(struct vfio_device *core_vdev, container_of(core_vdev, struct nvgrace_gpu_pci_core_device, core_device.vdev); struct mem_region *memregion; - unsigned long start_pfn; u64 req_len, pgoff, end; unsigned int index; - int ret = 0; index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT); @@ -157,17 +377,18 @@ static int nvgrace_gpu_mmap(struct vfio_device *core_vdev, ((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1); if (check_sub_overflow(vma->vm_end, vma->vm_start, &req_len) || - check_add_overflow(PHYS_PFN(memregion->memphys), pgoff, &start_pfn) || check_add_overflow(PFN_PHYS(pgoff), req_len, &end)) return -EOVERFLOW; /* - * Check that the mapping request does not go beyond available device - * memory size + * Check that the mapping request does not go beyond the exposed + * device memory size. */ if (end > memregion->memlength) return -EINVAL; + vm_flags_set(vma, VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP); + /* * The carved out region of the device memory needs the NORMAL_NC * property. Communicate as such to the hypervisor. @@ -184,23 +405,8 @@ static int nvgrace_gpu_mmap(struct vfio_device *core_vdev, vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot); } - /* - * Perform a PFN map to the memory and back the device BAR by the - * GPU memory. - * - * The available GPU memory size may not be power-of-2 aligned. The - * remainder is only backed by vfio_device_ops read/write handlers. - * - * During device reset, the GPU is safely disconnected to the CPU - * and access to the BAR will be immediately returned preventing - * machine check. - */ - ret = remap_pfn_range(vma, vma->vm_start, start_pfn, - req_len, vma->vm_page_prot); - if (ret) - return ret; - - vma->vm_pgoff = start_pfn; + vma->vm_ops = &nvgrace_gpu_vfio_pci_mmap_ops; + vma->vm_private_data = nvdev; return 0; } @@ -510,6 +716,7 @@ static ssize_t nvgrace_gpu_read_mem(struct nvgrace_gpu_pci_core_device *nvdev, char __user *buf, size_t count, loff_t *ppos) { + struct vfio_pci_core_device *vdev = &nvdev->core_device; u64 offset = *ppos & VFIO_PCI_OFFSET_MASK; unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos); struct mem_region *memregion; @@ -536,9 +743,15 @@ nvgrace_gpu_read_mem(struct nvgrace_gpu_pci_core_device *nvdev, else mem_count = min(count, memregion->memlength - (size_t)offset); - ret = nvgrace_gpu_map_and_read(nvdev, buf, mem_count, ppos); - if (ret) - return ret; + scoped_guard(rwsem_read, &vdev->memory_lock) { + ret = nvgrace_gpu_check_device_ready(nvdev); + if (ret) + return ret; + + ret = nvgrace_gpu_map_and_read(nvdev, buf, mem_count, ppos); + if (ret) + return ret; + } /* * Only the device memory present on the hardware is mapped, which may @@ -563,9 +776,16 @@ nvgrace_gpu_read(struct vfio_device *core_vdev, struct nvgrace_gpu_pci_core_device *nvdev = container_of(core_vdev, struct nvgrace_gpu_pci_core_device, core_device.vdev); + struct vfio_pci_core_device *vdev = &nvdev->core_device; + int ret; - if (nvgrace_gpu_memregion(index, nvdev)) - return nvgrace_gpu_read_mem(nvdev, buf, count, ppos); + if (nvgrace_gpu_memregion(index, nvdev)) { + if (pm_runtime_resume_and_get(&vdev->pdev->dev)) + return -EIO; + ret = nvgrace_gpu_read_mem(nvdev, buf, count, ppos); + pm_runtime_put(&vdev->pdev->dev); + return ret; + } if (index == VFIO_PCI_CONFIG_REGION_INDEX) return nvgrace_gpu_read_config_emu(core_vdev, buf, count, ppos); @@ -627,6 +847,7 @@ static ssize_t nvgrace_gpu_write_mem(struct nvgrace_gpu_pci_core_device *nvdev, size_t count, loff_t *ppos, const char __user *buf) { + struct vfio_pci_core_device *vdev = &nvdev->core_device; unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos); u64 offset = *ppos & VFIO_PCI_OFFSET_MASK; struct mem_region *memregion; @@ -656,9 +877,15 @@ nvgrace_gpu_write_mem(struct nvgrace_gpu_pci_core_device *nvdev, */ mem_count = min(count, memregion->memlength - (size_t)offset); - ret = nvgrace_gpu_map_and_write(nvdev, buf, mem_count, ppos); - if (ret) - return ret; + scoped_guard(rwsem_read, &vdev->memory_lock) { + ret = nvgrace_gpu_check_device_ready(nvdev); + if (ret) + return ret; + + ret = nvgrace_gpu_map_and_write(nvdev, buf, mem_count, ppos); + if (ret) + return ret; + } exitfn: *ppos += count; @@ -672,10 +899,17 @@ nvgrace_gpu_write(struct vfio_device *core_vdev, struct nvgrace_gpu_pci_core_device *nvdev = container_of(core_vdev, struct nvgrace_gpu_pci_core_device, core_device.vdev); + struct vfio_pci_core_device *vdev = &nvdev->core_device; unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos); + int ret; - if (nvgrace_gpu_memregion(index, nvdev)) - return nvgrace_gpu_write_mem(nvdev, count, ppos, buf); + if (nvgrace_gpu_memregion(index, nvdev)) { + if (pm_runtime_resume_and_get(&vdev->pdev->dev)) + return -EIO; + ret = nvgrace_gpu_write_mem(nvdev, count, ppos, buf); + pm_runtime_put(&vdev->pdev->dev); + return ret; + } if (index == VFIO_PCI_CONFIG_REGION_INDEX) return nvgrace_gpu_write_config_emu(core_vdev, buf, count, ppos); @@ -683,6 +917,50 @@ nvgrace_gpu_write(struct vfio_device *core_vdev, return vfio_pci_core_write(core_vdev, buf, count, ppos); } +static int nvgrace_get_dmabuf_phys(struct vfio_pci_core_device *core_vdev, + struct p2pdma_provider **provider, + unsigned int region_index, + struct dma_buf_phys_vec *phys_vec, + struct vfio_region_dma_range *dma_ranges, + size_t nr_ranges) +{ + struct nvgrace_gpu_pci_core_device *nvdev = container_of( + core_vdev, struct nvgrace_gpu_pci_core_device, core_device); + struct pci_dev *pdev = core_vdev->pdev; + struct mem_region *mem_region; + + /* + * if (nvdev->resmem.memlength && region_index == RESMEM_REGION_INDEX) { + * The P2P properties of the non-BAR memory is the same as the + * BAR memory, so just use the provider for index 0. Someday + * when CXL gets P2P support we could create CXLish providers + * for the non-BAR memory. + * } else if (region_index == USEMEM_REGION_INDEX) { + * This is actually cachable memory and isn't treated as P2P in + * the chip. For now we have no way to push cachable memory + * through everything and the Grace HW doesn't care what caching + * attribute is programmed into the SMMU. So use BAR 0. + * } + */ + mem_region = nvgrace_gpu_memregion(region_index, nvdev); + if (mem_region) { + *provider = pcim_p2pdma_provider(pdev, 0); + if (!*provider) + return -EINVAL; + return vfio_pci_core_fill_phys_vec(phys_vec, dma_ranges, + nr_ranges, + mem_region->memphys, + mem_region->memlength); + } + + return vfio_pci_core_get_dmabuf_phys(core_vdev, provider, region_index, + phys_vec, dma_ranges, nr_ranges); +} + +static const struct vfio_pci_device_ops nvgrace_gpu_pci_dev_ops = { + .get_dmabuf_phys = nvgrace_get_dmabuf_phys, +}; + static const struct vfio_device_ops nvgrace_gpu_pci_ops = { .name = "nvgrace-gpu-vfio-pci", .init = vfio_pci_core_init_dev, @@ -703,6 +981,10 @@ static const struct vfio_device_ops nvgrace_gpu_pci_ops = { .detach_ioas = vfio_iommufd_physical_detach_ioas, }; +static const struct vfio_pci_device_ops nvgrace_gpu_pci_dev_core_ops = { + .get_dmabuf_phys = vfio_pci_core_get_dmabuf_phys, +}; + static const struct vfio_device_ops nvgrace_gpu_pci_core_ops = { .name = "nvgrace-gpu-vfio-pci-core", .init = vfio_pci_core_init_dev, @@ -893,11 +1175,10 @@ static bool nvgrace_gpu_has_mig_hw_bug(struct pci_dev *pdev) * Ensure that the BAR0 region is enabled before accessing the * registers. */ -static int nvgrace_gpu_wait_device_ready(struct pci_dev *pdev) +static int nvgrace_gpu_probe_check_device_ready(struct pci_dev *pdev) { - unsigned long timeout = jiffies + msecs_to_jiffies(POLL_TIMEOUT_MS); void __iomem *io; - int ret = -ETIME; + int ret; ret = pci_enable_device(pdev); if (ret) @@ -913,16 +1194,8 @@ static int nvgrace_gpu_wait_device_ready(struct pci_dev *pdev) goto iomap_exit; } - do { - if ((ioread32(io + C2C_LINK_BAR0_OFFSET) == STATUS_READY) && - (ioread32(io + HBM_TRAINING_BAR0_OFFSET) == STATUS_READY)) { - ret = 0; - goto reg_check_exit; - } - msleep(POLL_QUANTUM_MS); - } while (!time_after(jiffies, timeout)); + ret = nvgrace_gpu_wait_device_ready(io); -reg_check_exit: pci_iounmap(pdev, io); iomap_exit: pci_release_selected_regions(pdev, 1 << 0); @@ -939,7 +1212,7 @@ static int nvgrace_gpu_probe(struct pci_dev *pdev, u64 memphys, memlength; int ret; - ret = nvgrace_gpu_wait_device_ready(pdev); + ret = nvgrace_gpu_probe_check_device_ready(pdev); if (ret) return ret; @@ -965,6 +1238,9 @@ static int nvgrace_gpu_probe(struct pci_dev *pdev, memphys, memlength); if (ret) goto out_put_vdev; + nvdev->core_device.pci_ops = &nvgrace_gpu_pci_dev_ops; + } else { + nvdev->core_device.pci_ops = &nvgrace_gpu_pci_dev_core_ops; } ret = vfio_pci_core_register_device(&nvdev->core_device); @@ -1002,12 +1278,38 @@ static const struct pci_device_id nvgrace_gpu_vfio_pci_table[] = { MODULE_DEVICE_TABLE(pci, nvgrace_gpu_vfio_pci_table); +/* + * The GPU reset is required to be serialized against the *first* mapping + * faults and read/writes accesses to prevent potential RAS events logging. + * + * First fault or access after a reset needs to poll device readiness, + * flag that a reset has occurred. The readiness test is done by holding + * the memory_lock read lock and we expect all vfio-pci initiated resets to + * hold the memory_lock write lock to avoid races. However, .reset_done + * extends beyond the scope of vfio-pci initiated resets therefore we + * cannot assert this behavior and use lockdep_assert_held_write. + */ +static void nvgrace_gpu_vfio_pci_reset_done(struct pci_dev *pdev) +{ + struct vfio_pci_core_device *core_device = dev_get_drvdata(&pdev->dev); + struct nvgrace_gpu_pci_core_device *nvdev = + container_of(core_device, struct nvgrace_gpu_pci_core_device, + core_device); + + nvdev->reset_done = true; +} + +static const struct pci_error_handlers nvgrace_gpu_vfio_pci_err_handlers = { + .reset_done = nvgrace_gpu_vfio_pci_reset_done, + .error_detected = vfio_pci_core_aer_err_detected, +}; + static struct pci_driver nvgrace_gpu_vfio_pci_driver = { .name = KBUILD_MODNAME, .id_table = nvgrace_gpu_vfio_pci_table, .probe = nvgrace_gpu_probe, .remove = nvgrace_gpu_remove, - .err_handler = &vfio_pci_core_err_handlers, + .err_handler = &nvgrace_gpu_vfio_pci_err_handlers, .driver_managed_dma = true, }; diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index ac10f14417f2f..6d41cf26b5399 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -147,6 +147,10 @@ static const struct vfio_device_ops vfio_pci_ops = { .pasid_detach_ioas = vfio_iommufd_physical_pasid_detach_ioas, }; +static const struct vfio_pci_device_ops vfio_pci_dev_ops = { + .get_dmabuf_phys = vfio_pci_core_get_dmabuf_phys, +}; + static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) { struct vfio_pci_core_device *vdev; @@ -161,6 +165,7 @@ static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) return PTR_ERR(vdev); dev_set_drvdata(&pdev->dev, vdev); + vdev->pci_ops = &vfio_pci_dev_ops; ret = vfio_pci_core_register_device(vdev); if (ret) goto out_put_vdev; diff --git a/drivers/vfio/pci/vfio_pci_config.c b/drivers/vfio/pci/vfio_pci_config.c index 8f02f236b5b4b..dc4e510e6e1bf 100644 --- a/drivers/vfio/pci/vfio_pci_config.c +++ b/drivers/vfio/pci/vfio_pci_config.c @@ -416,6 +416,7 @@ bool __vfio_pci_memory_enabled(struct vfio_pci_core_device *vdev) return pdev->current_state < PCI_D3hot && (pdev->no_command_memory || (cmd & PCI_COMMAND_MEMORY)); } +EXPORT_SYMBOL_GPL(__vfio_pci_memory_enabled); /* * Restore the *real* BARs after we detect a FLR or backdoor reset. @@ -589,10 +590,12 @@ static int vfio_basic_config_write(struct vfio_pci_core_device *vdev, int pos, virt_mem = !!(le16_to_cpu(*virt_cmd) & PCI_COMMAND_MEMORY); new_mem = !!(new_cmd & PCI_COMMAND_MEMORY); - if (!new_mem) + if (!new_mem) { vfio_pci_zap_and_down_write_memory_lock(vdev); - else + vfio_pci_dma_buf_move(vdev, true); + } else { down_write(&vdev->memory_lock); + } /* * If the user is writing mem/io enable (new_mem/io) and we @@ -627,6 +630,8 @@ static int vfio_basic_config_write(struct vfio_pci_core_device *vdev, int pos, *virt_cmd &= cpu_to_le16(~mask); *virt_cmd |= cpu_to_le16(new_cmd & mask); + if (__vfio_pci_memory_enabled(vdev)) + vfio_pci_dma_buf_move(vdev, false); up_write(&vdev->memory_lock); } @@ -707,12 +712,16 @@ static int __init init_pci_cap_basic_perm(struct perm_bits *perm) static void vfio_lock_and_set_power_state(struct vfio_pci_core_device *vdev, pci_power_t state) { - if (state >= PCI_D3hot) + if (state >= PCI_D3hot) { vfio_pci_zap_and_down_write_memory_lock(vdev); - else + vfio_pci_dma_buf_move(vdev, true); + } else { down_write(&vdev->memory_lock); + } vfio_pci_set_power_state(vdev, state); + if (__vfio_pci_memory_enabled(vdev)) + vfio_pci_dma_buf_move(vdev, false); up_write(&vdev->memory_lock); } @@ -900,7 +909,10 @@ static int vfio_exp_config_write(struct vfio_pci_core_device *vdev, int pos, if (!ret && (cap & PCI_EXP_DEVCAP_FLR)) { vfio_pci_zap_and_down_write_memory_lock(vdev); + vfio_pci_dma_buf_move(vdev, true); pci_try_reset_function(vdev->pdev); + if (__vfio_pci_memory_enabled(vdev)) + vfio_pci_dma_buf_move(vdev, false); up_write(&vdev->memory_lock); } } @@ -982,7 +994,10 @@ static int vfio_af_config_write(struct vfio_pci_core_device *vdev, int pos, if (!ret && (cap & PCI_AF_CAP_FLR) && (cap & PCI_AF_CAP_TP)) { vfio_pci_zap_and_down_write_memory_lock(vdev); + vfio_pci_dma_buf_move(vdev, true); pci_try_reset_function(vdev->pdev); + if (__vfio_pci_memory_enabled(vdev)) + vfio_pci_dma_buf_move(vdev, false); up_write(&vdev->memory_lock); } } diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index 53a846d3e6758..b472cd93eee40 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -28,6 +28,7 @@ #include #include #include +#include #if IS_ENABLED(CONFIG_EEH) #include #endif @@ -320,6 +321,8 @@ static int vfio_pci_runtime_pm_entry(struct vfio_pci_core_device *vdev, * semaphore. */ vfio_pci_zap_and_down_write_memory_lock(vdev); + vfio_pci_dma_buf_move(vdev, true); + if (vdev->pm_runtime_engaged) { up_write(&vdev->memory_lock); return -EINVAL; @@ -333,11 +336,9 @@ static int vfio_pci_runtime_pm_entry(struct vfio_pci_core_device *vdev, return 0; } -static int vfio_pci_core_pm_entry(struct vfio_device *device, u32 flags, +static int vfio_pci_core_pm_entry(struct vfio_pci_core_device *vdev, u32 flags, void __user *arg, size_t argsz) { - struct vfio_pci_core_device *vdev = - container_of(device, struct vfio_pci_core_device, vdev); int ret; ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_SET, 0); @@ -354,12 +355,10 @@ static int vfio_pci_core_pm_entry(struct vfio_device *device, u32 flags, } static int vfio_pci_core_pm_entry_with_wakeup( - struct vfio_device *device, u32 flags, + struct vfio_pci_core_device *vdev, u32 flags, struct vfio_device_low_power_entry_with_wakeup __user *arg, size_t argsz) { - struct vfio_pci_core_device *vdev = - container_of(device, struct vfio_pci_core_device, vdev); struct vfio_device_low_power_entry_with_wakeup entry; struct eventfd_ctx *efdctx; int ret; @@ -407,14 +406,14 @@ static void vfio_pci_runtime_pm_exit(struct vfio_pci_core_device *vdev) */ down_write(&vdev->memory_lock); __vfio_pci_runtime_pm_exit(vdev); + if (__vfio_pci_memory_enabled(vdev)) + vfio_pci_dma_buf_move(vdev, false); up_write(&vdev->memory_lock); } -static int vfio_pci_core_pm_exit(struct vfio_device *device, u32 flags, +static int vfio_pci_core_pm_exit(struct vfio_pci_core_device *vdev, u32 flags, void __user *arg, size_t argsz) { - struct vfio_pci_core_device *vdev = - container_of(device, struct vfio_pci_core_device, vdev); int ret; ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_SET, 0); @@ -738,6 +737,8 @@ void vfio_pci_core_close_device(struct vfio_device *core_vdev) #endif vfio_pci_core_disable(vdev); + vfio_pci_dma_buf_cleanup(vdev); + mutex_lock(&vdev->igate); vfio_pci_eventfd_replace_locked(vdev, &vdev->err_trigger, NULL); vfio_pci_eventfd_replace_locked(vdev, &vdev->req_trigger, NULL); @@ -1264,7 +1265,10 @@ static int vfio_pci_ioctl_reset(struct vfio_pci_core_device *vdev, */ vfio_pci_set_power_state(vdev, PCI_D0); + vfio_pci_dma_buf_move(vdev, true); ret = pci_try_reset_function(vdev->pdev); + if (__vfio_pci_memory_enabled(vdev)) + vfio_pci_dma_buf_move(vdev, false); up_write(&vdev->memory_lock); return ret; @@ -1510,11 +1514,10 @@ long vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd, } EXPORT_SYMBOL_GPL(vfio_pci_core_ioctl); -static int vfio_pci_core_feature_token(struct vfio_device *device, u32 flags, - uuid_t __user *arg, size_t argsz) +static int vfio_pci_core_feature_token(struct vfio_pci_core_device *vdev, + u32 flags, uuid_t __user *arg, + size_t argsz) { - struct vfio_pci_core_device *vdev = - container_of(device, struct vfio_pci_core_device, vdev); uuid_t uuid; int ret; @@ -1541,16 +1544,21 @@ static int vfio_pci_core_feature_token(struct vfio_device *device, u32 flags, int vfio_pci_core_ioctl_feature(struct vfio_device *device, u32 flags, void __user *arg, size_t argsz) { + struct vfio_pci_core_device *vdev = + container_of(device, struct vfio_pci_core_device, vdev); + switch (flags & VFIO_DEVICE_FEATURE_MASK) { case VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY: - return vfio_pci_core_pm_entry(device, flags, arg, argsz); + return vfio_pci_core_pm_entry(vdev, flags, arg, argsz); case VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY_WITH_WAKEUP: - return vfio_pci_core_pm_entry_with_wakeup(device, flags, + return vfio_pci_core_pm_entry_with_wakeup(vdev, flags, arg, argsz); case VFIO_DEVICE_FEATURE_LOW_POWER_EXIT: - return vfio_pci_core_pm_exit(device, flags, arg, argsz); + return vfio_pci_core_pm_exit(vdev, flags, arg, argsz); case VFIO_DEVICE_FEATURE_PCI_VF_TOKEN: - return vfio_pci_core_feature_token(device, flags, arg, argsz); + return vfio_pci_core_feature_token(vdev, flags, arg, argsz); + case VFIO_DEVICE_FEATURE_DMA_BUF: + return vfio_pci_core_feature_dma_buf(vdev, flags, arg, argsz); default: return -ENOTTY; } @@ -1781,18 +1789,9 @@ int vfio_pci_core_mmap(struct vfio_device *core_vdev, struct vm_area_struct *vma * Even though we don't make use of the barmap for the mmap, * we need to request the region and the barmap tracks that. */ - if (!vdev->barmap[index]) { - ret = pci_request_selected_regions(pdev, - 1 << index, "vfio-pci"); - if (ret) - return ret; - - vdev->barmap[index] = pci_iomap(pdev, index, 0); - if (!vdev->barmap[index]) { - pci_release_selected_regions(pdev, 1 << index); - return -ENOMEM; - } - } + ret = vfio_pci_core_setup_barmap(vdev, index); + if (ret) + return ret; vma->vm_private_data = vdev; vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); @@ -2116,6 +2115,7 @@ int vfio_pci_core_init_dev(struct vfio_device *core_vdev) { struct vfio_pci_core_device *vdev = container_of(core_vdev, struct vfio_pci_core_device, vdev); + int ret; vdev->pdev = to_pci_dev(core_vdev->dev); vdev->irq_type = VFIO_PCI_NUM_IRQS; @@ -2125,6 +2125,10 @@ int vfio_pci_core_init_dev(struct vfio_device *core_vdev) INIT_LIST_HEAD(&vdev->dummy_resources_list); INIT_LIST_HEAD(&vdev->ioeventfds_list); INIT_LIST_HEAD(&vdev->sriov_pfs_item); + ret = pcim_p2pdma_init(vdev->pdev); + if (ret && ret != -EOPNOTSUPP) + return ret; + INIT_LIST_HEAD(&vdev->dmabufs); init_rwsem(&vdev->memory_lock); xa_init(&vdev->ctx); @@ -2489,6 +2493,7 @@ static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set, break; } + vfio_pci_dma_buf_move(vdev, true); vfio_pci_zap_bars(vdev); } @@ -2517,8 +2522,11 @@ static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set, err_undo: list_for_each_entry_from_reverse(vdev, &dev_set->device_list, - vdev.dev_set_list) + vdev.dev_set_list) { + if (vdev->vdev.open_count && __vfio_pci_memory_enabled(vdev)) + vfio_pci_dma_buf_move(vdev, false); up_write(&vdev->memory_lock); + } list_for_each_entry(vdev, &dev_set->device_list, vdev.dev_set_list) pm_runtime_put(&vdev->pdev->dev); diff --git a/drivers/vfio/pci/vfio_pci_dmabuf.c b/drivers/vfio/pci/vfio_pci_dmabuf.c new file mode 100644 index 0000000000000..d4d0f7d08c53e --- /dev/null +++ b/drivers/vfio/pci/vfio_pci_dmabuf.c @@ -0,0 +1,350 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. + */ +#include +#include +#include + +#include "vfio_pci_priv.h" + +MODULE_IMPORT_NS("DMA_BUF"); + +struct vfio_pci_dma_buf { + struct dma_buf *dmabuf; + struct vfio_pci_core_device *vdev; + struct list_head dmabufs_elm; + size_t size; + struct dma_buf_phys_vec *phys_vec; + struct p2pdma_provider *provider; + u32 nr_ranges; + u8 revoked : 1; +}; + +static int vfio_pci_dma_buf_attach(struct dma_buf *dmabuf, + struct dma_buf_attachment *attachment) +{ + struct vfio_pci_dma_buf *priv = dmabuf->priv; + + if (!attachment->peer2peer) + return -EOPNOTSUPP; + + if (priv->revoked) + return -ENODEV; + + return 0; +} + +static struct sg_table * +vfio_pci_dma_buf_map(struct dma_buf_attachment *attachment, + enum dma_data_direction dir) +{ + struct vfio_pci_dma_buf *priv = attachment->dmabuf->priv; + + dma_resv_assert_held(priv->dmabuf->resv); + + if (priv->revoked) + return ERR_PTR(-ENODEV); + + return dma_buf_phys_vec_to_sgt(attachment, priv->provider, + priv->phys_vec, priv->nr_ranges, + priv->size, dir); +} + +static void vfio_pci_dma_buf_unmap(struct dma_buf_attachment *attachment, + struct sg_table *sgt, + enum dma_data_direction dir) +{ + dma_buf_free_sgt(attachment, sgt, dir); +} + +static void vfio_pci_dma_buf_release(struct dma_buf *dmabuf) +{ + struct vfio_pci_dma_buf *priv = dmabuf->priv; + + /* + * Either this or vfio_pci_dma_buf_cleanup() will remove from the list. + * The refcount prevents both. + */ + if (priv->vdev) { + down_write(&priv->vdev->memory_lock); + list_del_init(&priv->dmabufs_elm); + up_write(&priv->vdev->memory_lock); + vfio_device_put_registration(&priv->vdev->vdev); + } + kfree(priv->phys_vec); + kfree(priv); +} + +static const struct dma_buf_ops vfio_pci_dmabuf_ops = { + .attach = vfio_pci_dma_buf_attach, + .map_dma_buf = vfio_pci_dma_buf_map, + .unmap_dma_buf = vfio_pci_dma_buf_unmap, + .release = vfio_pci_dma_buf_release, +}; + +/* + * This is a temporary "private interconnect" between VFIO DMABUF and iommufd. + * It allows the two co-operating drivers to exchange the physical address of + * the BAR. This is to be replaced with a formal DMABUF system for negotiated + * interconnect types. + * + * If this function succeeds the following are true: + * - There is one physical range and it is pointing to MMIO + * - When move_notify is called it means revoke, not move, vfio_dma_buf_map + * will fail if it is currently revoked + */ +int vfio_pci_dma_buf_iommufd_map(struct dma_buf_attachment *attachment, + struct dma_buf_phys_vec *phys) +{ + struct vfio_pci_dma_buf *priv; + + dma_resv_assert_held(attachment->dmabuf->resv); + + if (attachment->dmabuf->ops != &vfio_pci_dmabuf_ops) + return -EOPNOTSUPP; + + priv = attachment->dmabuf->priv; + if (priv->revoked) + return -ENODEV; + + /* More than one range to iommufd will require proper DMABUF support */ + if (priv->nr_ranges != 1) + return -EOPNOTSUPP; + + *phys = priv->phys_vec[0]; + return 0; +} +EXPORT_SYMBOL_FOR_MODULES(vfio_pci_dma_buf_iommufd_map, "iommufd"); + +int vfio_pci_core_fill_phys_vec(struct dma_buf_phys_vec *phys_vec, + struct vfio_region_dma_range *dma_ranges, + size_t nr_ranges, phys_addr_t start, + phys_addr_t len) +{ + phys_addr_t max_addr; + unsigned int i; + + max_addr = start + len; + for (i = 0; i < nr_ranges; i++) { + phys_addr_t end; + + if (!dma_ranges[i].length) + return -EINVAL; + + if (check_add_overflow(start, dma_ranges[i].offset, + &phys_vec[i].paddr) || + check_add_overflow(phys_vec[i].paddr, + dma_ranges[i].length, &end)) + return -EOVERFLOW; + if (end > max_addr) + return -EINVAL; + + phys_vec[i].len = dma_ranges[i].length; + } + return 0; +} +EXPORT_SYMBOL_GPL(vfio_pci_core_fill_phys_vec); + +int vfio_pci_core_get_dmabuf_phys(struct vfio_pci_core_device *vdev, + struct p2pdma_provider **provider, + unsigned int region_index, + struct dma_buf_phys_vec *phys_vec, + struct vfio_region_dma_range *dma_ranges, + size_t nr_ranges) +{ + struct pci_dev *pdev = vdev->pdev; + + *provider = pcim_p2pdma_provider(pdev, region_index); + if (!*provider) + return -EINVAL; + + return vfio_pci_core_fill_phys_vec( + phys_vec, dma_ranges, nr_ranges, + pci_resource_start(pdev, region_index), + pci_resource_len(pdev, region_index)); +} +EXPORT_SYMBOL_GPL(vfio_pci_core_get_dmabuf_phys); + +static int validate_dmabuf_input(struct vfio_device_feature_dma_buf *dma_buf, + struct vfio_region_dma_range *dma_ranges, + size_t *lengthp) +{ + size_t length = 0; + u32 i; + + for (i = 0; i < dma_buf->nr_ranges; i++) { + u64 offset = dma_ranges[i].offset; + u64 len = dma_ranges[i].length; + + if (!len || !PAGE_ALIGNED(offset) || !PAGE_ALIGNED(len)) + return -EINVAL; + + if (check_add_overflow(length, len, &length)) + return -EINVAL; + } + + /* + * dma_iova_try_alloc() will WARN on if userspace proposes a size that + * is too big, eg with lots of ranges. + */ + if ((u64)(length) & DMA_IOVA_USE_SWIOTLB) + return -EINVAL; + + *lengthp = length; + return 0; +} + +int vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev, u32 flags, + struct vfio_device_feature_dma_buf __user *arg, + size_t argsz) +{ + struct vfio_device_feature_dma_buf get_dma_buf = {}; + struct vfio_region_dma_range *dma_ranges; + DEFINE_DMA_BUF_EXPORT_INFO(exp_info); + struct vfio_pci_dma_buf *priv; + size_t length; + int ret; + + if (!vdev->pci_ops || !vdev->pci_ops->get_dmabuf_phys) + return -EOPNOTSUPP; + + ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET, + sizeof(get_dma_buf)); + if (ret != 1) + return ret; + + if (copy_from_user(&get_dma_buf, arg, sizeof(get_dma_buf))) + return -EFAULT; + + if (!get_dma_buf.nr_ranges || get_dma_buf.flags) + return -EINVAL; + + /* + * For PCI the region_index is the BAR number like everything else. + */ + if (get_dma_buf.region_index >= VFIO_PCI_ROM_REGION_INDEX) + return -ENODEV; + + dma_ranges = memdup_array_user(&arg->dma_ranges, get_dma_buf.nr_ranges, + sizeof(*dma_ranges)); + if (IS_ERR(dma_ranges)) + return PTR_ERR(dma_ranges); + + ret = validate_dmabuf_input(&get_dma_buf, dma_ranges, &length); + if (ret) + goto err_free_ranges; + + priv = kzalloc(sizeof(*priv), GFP_KERNEL); + if (!priv) { + ret = -ENOMEM; + goto err_free_ranges; + } + priv->phys_vec = kcalloc(get_dma_buf.nr_ranges, sizeof(*priv->phys_vec), + GFP_KERNEL); + if (!priv->phys_vec) { + ret = -ENOMEM; + goto err_free_priv; + } + + priv->vdev = vdev; + priv->nr_ranges = get_dma_buf.nr_ranges; + priv->size = length; + ret = vdev->pci_ops->get_dmabuf_phys(vdev, &priv->provider, + get_dma_buf.region_index, + priv->phys_vec, dma_ranges, + priv->nr_ranges); + if (ret) + goto err_free_phys; + + kfree(dma_ranges); + dma_ranges = NULL; + + if (!vfio_device_try_get_registration(&vdev->vdev)) { + ret = -ENODEV; + goto err_free_phys; + } + + exp_info.ops = &vfio_pci_dmabuf_ops; + exp_info.size = priv->size; + exp_info.flags = get_dma_buf.open_flags; + exp_info.priv = priv; + + priv->dmabuf = dma_buf_export(&exp_info); + if (IS_ERR(priv->dmabuf)) { + ret = PTR_ERR(priv->dmabuf); + goto err_dev_put; + } + + /* dma_buf_put() now frees priv */ + INIT_LIST_HEAD(&priv->dmabufs_elm); + down_write(&vdev->memory_lock); + dma_resv_lock(priv->dmabuf->resv, NULL); + priv->revoked = !__vfio_pci_memory_enabled(vdev); + list_add_tail(&priv->dmabufs_elm, &vdev->dmabufs); + dma_resv_unlock(priv->dmabuf->resv); + up_write(&vdev->memory_lock); + + /* + * dma_buf_fd() consumes the reference, when the file closes the dmabuf + * will be released. + */ + ret = dma_buf_fd(priv->dmabuf, get_dma_buf.open_flags); + if (ret < 0) + goto err_dma_buf; + return ret; + +err_dma_buf: + dma_buf_put(priv->dmabuf); +err_dev_put: + vfio_device_put_registration(&vdev->vdev); +err_free_phys: + kfree(priv->phys_vec); +err_free_priv: + kfree(priv); +err_free_ranges: + kfree(dma_ranges); + return ret; +} + +void vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev, bool revoked) +{ + struct vfio_pci_dma_buf *priv; + struct vfio_pci_dma_buf *tmp; + + lockdep_assert_held_write(&vdev->memory_lock); + + list_for_each_entry_safe(priv, tmp, &vdev->dmabufs, dmabufs_elm) { + if (!get_file_active(&priv->dmabuf->file)) + continue; + + if (priv->revoked != revoked) { + dma_resv_lock(priv->dmabuf->resv, NULL); + priv->revoked = revoked; + dma_buf_move_notify(priv->dmabuf); + dma_resv_unlock(priv->dmabuf->resv); + } + fput(priv->dmabuf->file); + } +} + +void vfio_pci_dma_buf_cleanup(struct vfio_pci_core_device *vdev) +{ + struct vfio_pci_dma_buf *priv; + struct vfio_pci_dma_buf *tmp; + + down_write(&vdev->memory_lock); + list_for_each_entry_safe(priv, tmp, &vdev->dmabufs, dmabufs_elm) { + if (!get_file_active(&priv->dmabuf->file)) + continue; + + dma_resv_lock(priv->dmabuf->resv, NULL); + list_del_init(&priv->dmabufs_elm); + priv->vdev = NULL; + priv->revoked = true; + dma_buf_move_notify(priv->dmabuf); + dma_resv_unlock(priv->dmabuf->resv); + vfio_device_put_registration(&vdev->vdev); + fput(priv->dmabuf->file); + } + up_write(&vdev->memory_lock); +} diff --git a/drivers/vfio/pci/vfio_pci_priv.h b/drivers/vfio/pci/vfio_pci_priv.h index 97d992c063229..27ac280f00b97 100644 --- a/drivers/vfio/pci/vfio_pci_priv.h +++ b/drivers/vfio/pci/vfio_pci_priv.h @@ -64,7 +64,6 @@ void vfio_config_free(struct vfio_pci_core_device *vdev); int vfio_pci_set_power_state(struct vfio_pci_core_device *vdev, pci_power_t state); -bool __vfio_pci_memory_enabled(struct vfio_pci_core_device *vdev); void vfio_pci_zap_and_down_write_memory_lock(struct vfio_pci_core_device *vdev); u16 vfio_pci_memory_lock_and_enable(struct vfio_pci_core_device *vdev); void vfio_pci_memory_unlock_and_restore(struct vfio_pci_core_device *vdev, @@ -111,4 +110,27 @@ static inline bool vfio_pci_is_vga(struct pci_dev *pdev) return (pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA; } +#ifdef CONFIG_VFIO_PCI_DMABUF +int vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev, u32 flags, + struct vfio_device_feature_dma_buf __user *arg, + size_t argsz); +void vfio_pci_dma_buf_cleanup(struct vfio_pci_core_device *vdev); +void vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev, bool revoked); +#else +static inline int +vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev, u32 flags, + struct vfio_device_feature_dma_buf __user *arg, + size_t argsz) +{ + return -ENOTTY; +} +static inline void vfio_pci_dma_buf_cleanup(struct vfio_pci_core_device *vdev) +{ +} +static inline void vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev, + bool revoked) +{ +} +#endif + #endif diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index 38c8e9350a60e..9aa4a5d081e8f 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -172,11 +172,13 @@ void vfio_device_put_registration(struct vfio_device *device) if (refcount_dec_and_test(&device->refcount)) complete(&device->comp); } +EXPORT_SYMBOL_GPL(vfio_device_put_registration); bool vfio_device_try_get_registration(struct vfio_device *device) { return refcount_inc_not_zero(&device->refcount); } +EXPORT_SYMBOL_GPL(vfio_device_try_get_registration); /* * VFIO driver API diff --git a/fs/resctrl/ctrlmondata.c b/fs/resctrl/ctrlmondata.c index 0d0ef54fc4de1..cc4237c57cbe4 100644 --- a/fs/resctrl/ctrlmondata.c +++ b/fs/resctrl/ctrlmondata.c @@ -17,6 +17,7 @@ #include #include +#include #include #include #include @@ -24,7 +25,8 @@ #include "internal.h" struct rdt_parse_data { - struct rdtgroup *rdtgrp; + u32 closid; + enum rdtgrp_mode mode; char *buf; }; @@ -77,8 +79,8 @@ static int parse_bw(struct rdt_parse_data *data, struct resctrl_schema *s, struct rdt_ctrl_domain *d) { struct resctrl_staged_config *cfg; - u32 closid = data->rdtgrp->closid; struct rdt_resource *r = s->res; + u32 closid = data->closid; u32 bw_val; cfg = &d->staged_config[s->conf_type]; @@ -156,9 +158,10 @@ static bool cbm_validate(char *buf, u32 *data, struct rdt_resource *r) static int parse_cbm(struct rdt_parse_data *data, struct resctrl_schema *s, struct rdt_ctrl_domain *d) { - struct rdtgroup *rdtgrp = data->rdtgrp; + enum rdtgrp_mode mode = data->mode; struct resctrl_staged_config *cfg; struct rdt_resource *r = s->res; + u32 closid = data->closid; u32 cbm_val; cfg = &d->staged_config[s->conf_type]; @@ -171,7 +174,7 @@ static int parse_cbm(struct rdt_parse_data *data, struct resctrl_schema *s, * Cannot set up more than one pseudo-locked region in a cache * hierarchy. */ - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP && + if (mode == RDT_MODE_PSEUDO_LOCKSETUP && rdtgroup_pseudo_locked_in_hierarchy(d)) { rdt_last_cmd_puts("Pseudo-locked region in hierarchy\n"); return -EINVAL; @@ -180,8 +183,7 @@ static int parse_cbm(struct rdt_parse_data *data, struct resctrl_schema *s, if (!cbm_validate(data->buf, &cbm_val, r)) return -EINVAL; - if ((rdtgrp->mode == RDT_MODE_EXCLUSIVE || - rdtgrp->mode == RDT_MODE_SHAREABLE) && + if ((mode == RDT_MODE_EXCLUSIVE || mode == RDT_MODE_SHAREABLE) && rdtgroup_cbm_overlaps_pseudo_locked(d, cbm_val)) { rdt_last_cmd_puts("CBM overlaps with pseudo-locked region\n"); return -EINVAL; @@ -191,14 +193,14 @@ static int parse_cbm(struct rdt_parse_data *data, struct resctrl_schema *s, * The CBM may not overlap with the CBM of another closid if * either is exclusive. */ - if (rdtgroup_cbm_overlaps(s, d, cbm_val, rdtgrp->closid, true)) { + if (rdtgroup_cbm_overlaps(s, d, cbm_val, closid, true)) { rdt_last_cmd_puts("Overlaps with exclusive group\n"); return -EINVAL; } - if (rdtgroup_cbm_overlaps(s, d, cbm_val, rdtgrp->closid, false)) { - if (rdtgrp->mode == RDT_MODE_EXCLUSIVE || - rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { + if (rdtgroup_cbm_overlaps(s, d, cbm_val, closid, false)) { + if (mode == RDT_MODE_EXCLUSIVE || + mode == RDT_MODE_PSEUDO_LOCKSETUP) { rdt_last_cmd_puts("Overlaps with other group\n"); return -EINVAL; } @@ -262,7 +264,8 @@ static int parse_line(char *line, struct resctrl_schema *s, list_for_each_entry(d, &r->ctrl_domains, hdr.list) { if (d->hdr.id == dom_id) { data.buf = dom; - data.rdtgrp = rdtgrp; + data.closid = rdtgrp->closid; + data.mode = rdtgrp->mode; if (parse_ctrlval(&data, s, d)) return -EINVAL; if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { @@ -381,7 +384,8 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, return ret ?: nbytes; } -static void show_doms(struct seq_file *s, struct resctrl_schema *schema, int closid) +static void show_doms(struct seq_file *s, struct resctrl_schema *schema, + char *resource_name, int closid) { struct rdt_resource *r = schema->res; struct rdt_ctrl_domain *dom; @@ -391,7 +395,8 @@ static void show_doms(struct seq_file *s, struct resctrl_schema *schema, int clo /* Walking r->domains, ensure it can't race with cpuhp */ lockdep_assert_cpus_held(); - seq_printf(s, "%*s:", max_name_width, schema->name); + if (resource_name) + seq_printf(s, "%*s:", max_name_width, resource_name); list_for_each_entry(dom, &r->ctrl_domains, hdr.list) { if (sep) seq_puts(s, ";"); @@ -437,7 +442,7 @@ int rdtgroup_schemata_show(struct kernfs_open_file *of, closid = rdtgrp->closid; list_for_each_entry(schema, &resctrl_schema_all, list) { if (closid < schema->num_closid) - show_doms(s, schema, closid); + show_doms(s, schema, schema->name, closid); } } } else { @@ -547,8 +552,8 @@ struct rdt_domain_hdr *resctrl_find_domain(struct list_head *h, int id, } void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, - struct rdt_mon_domain *d, struct rdtgroup *rdtgrp, - cpumask_t *cpumask, int evtid, int first) + struct rdt_domain_hdr *hdr, struct rdtgroup *rdtgrp, + cpumask_t *cpumask, struct mon_evt *evt, int first) { int cpu; @@ -559,21 +564,26 @@ void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, * Setup the parameters to pass to mon_event_count() to read the data. */ rr->rgrp = rdtgrp; - rr->evtid = evtid; + rr->evt = evt; rr->r = r; - rr->d = d; + rr->hdr = hdr; rr->first = first; if (resctrl_arch_mbm_cntr_assign_enabled(r) && - resctrl_is_mbm_event(evtid)) { + resctrl_is_mbm_event(evt->evtid)) { rr->is_mbm_cntr = true; } else { - rr->arch_mon_ctx = resctrl_arch_mon_ctx_alloc(r, evtid); + rr->arch_mon_ctx = resctrl_arch_mon_ctx_alloc(r, evt->evtid); if (IS_ERR(rr->arch_mon_ctx)) { rr->err = -EINVAL; return; } } + if (evt->any_cpu) { + mon_event_count(rr); + goto out_ctx_free; + } + cpu = cpumask_any_housekeeping(cpumask, RESCTRL_PICK_ANY_CPU); /* @@ -587,22 +597,93 @@ void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, else smp_call_on_cpu(cpu, smp_mon_event_count, rr, false); +out_ctx_free: if (rr->arch_mon_ctx) - resctrl_arch_mon_ctx_free(r, evtid, rr->arch_mon_ctx); + resctrl_arch_mon_ctx_free(r, evt->evtid, rr->arch_mon_ctx); +} + +/* + * Decimal place precision to use for each number of fixed-point + * binary bits computed from ceil(binary_bits * log10(2)) except + * binary_bits == 0 which will print "value.0" + */ +static const unsigned int decplaces[MAX_BINARY_BITS + 1] = { + [0] = 1, + [1] = 1, + [2] = 1, + [3] = 1, + [4] = 2, + [5] = 2, + [6] = 2, + [7] = 3, + [8] = 3, + [9] = 3, + [10] = 4, + [11] = 4, + [12] = 4, + [13] = 4, + [14] = 5, + [15] = 5, + [16] = 5, + [17] = 6, + [18] = 6, + [19] = 6, + [20] = 7, + [21] = 7, + [22] = 7, + [23] = 7, + [24] = 8, + [25] = 8, + [26] = 8, + [27] = 9 +}; + +static void print_event_value(struct seq_file *m, unsigned int binary_bits, u64 val) +{ + unsigned long long frac = 0; + + if (binary_bits) { + /* Mask off the integer part of the fixed-point value. */ + frac = val & GENMASK_ULL(binary_bits - 1, 0); + + /* + * Multiply by 10^{desired decimal places}. The integer part of + * the fixed point value is now almost what is needed. + */ + frac *= int_pow(10ull, decplaces[binary_bits]); + + /* + * Round to nearest by adding a value that would be a "1" in the + * binary_bits + 1 place. Integer part of fixed point value is + * now the needed value. + */ + frac += 1ull << (binary_bits - 1); + + /* + * Extract the integer part of the value. This is the decimal + * representation of the original fixed-point fractional value. + */ + frac >>= binary_bits; + } + + /* + * "frac" is now in the range [0 .. 10^decplaces). I.e. string + * representation will fit into chosen number of decimal places. + */ + seq_printf(m, "%llu.%0*llu\n", val >> binary_bits, decplaces[binary_bits], frac); } int rdtgroup_mondata_show(struct seq_file *m, void *arg) { struct kernfs_open_file *of = m->private; enum resctrl_res_level resid; - enum resctrl_event_id evtid; struct rdt_domain_hdr *hdr; struct rmid_read rr = {0}; - struct rdt_mon_domain *d; struct rdtgroup *rdtgrp; int domid, cpu, ret = 0; struct rdt_resource *r; struct cacheinfo *ci; + struct mon_evt *evt; struct mon_data *md; rdtgrp = rdtgroup_kn_lock_live(of->kn); @@ -619,10 +700,17 @@ int rdtgroup_mondata_show(struct seq_file *m, void *arg) resid = md->rid; domid = md->domid; - evtid = md->evtid; + evt = md->evt; r = resctrl_arch_get_resource(resid); if (md->sum) { + struct rdt_l3_mon_domain *d; + + if (WARN_ON_ONCE(resid != RDT_RESOURCE_L3)) { + ret = -EINVAL; + goto out; + } + /* * This file requires summing across all domains that share * the L3 cache id that was provided in the "domid" field of the @@ -637,7 +725,7 @@ int rdtgroup_mondata_show(struct seq_file *m, void *arg) continue; rr.ci = ci; mon_event_read(&rr, r, NULL, rdtgrp, - &ci->shared_cpu_map, evtid, false); + &ci->shared_cpu_map, evt, false); goto checkresult; } } @@ -649,12 +737,11 @@ int rdtgroup_mondata_show(struct seq_file *m, void *arg) * the resource to find the domain with "domid". */ hdr = resctrl_find_domain(&r->mon_domains, domid, NULL); - if (!hdr || WARN_ON_ONCE(hdr->type != RESCTRL_MON_DOMAIN)) { + if (!hdr) { ret = -ENOENT; goto out; } - d = container_of(hdr, struct rdt_mon_domain, hdr); - mon_event_read(&rr, r, d, rdtgrp, &d->hdr.cpu_mask, evtid, false); + mon_event_read(&rr, r, hdr, rdtgrp, &hdr->cpu_mask, evt, false); } checkresult: @@ -669,6 +756,8 @@ int rdtgroup_mondata_show(struct seq_file *m, void *arg) seq_puts(m, "Unavailable\n"); else if (rr.err == -ENOENT) seq_puts(m, "Unassigned\n"); + else if (evt->is_floating_point) + print_event_value(m, evt->binary_bits, rr.val); else seq_printf(m, "%llu\n", rr.val); @@ -676,3 +765,280 @@ int rdtgroup_mondata_show(struct seq_file *m, void *arg) rdtgroup_kn_unlock(of->kn); return ret; } + +int resctrl_io_alloc_show(struct kernfs_open_file *of, struct seq_file *seq, void *v) +{ + struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); + struct rdt_resource *r = s->res; + + mutex_lock(&rdtgroup_mutex); + + if (r->cache.io_alloc_capable) { + if (resctrl_arch_get_io_alloc_enabled(r)) + seq_puts(seq, "enabled\n"); + else + seq_puts(seq, "disabled\n"); + } else { + seq_puts(seq, "not supported\n"); + } + + mutex_unlock(&rdtgroup_mutex); + + return 0; +} + +/* + * resctrl_io_alloc_closid_supported() - io_alloc feature utilizes the + * highest CLOSID value to direct I/O traffic. Ensure that io_alloc_closid + * is in the supported range. + */ +static bool resctrl_io_alloc_closid_supported(u32 io_alloc_closid) +{ + return io_alloc_closid < closids_supported(); +} + +/* + * Initialize io_alloc CLOSID cache resource CBM with all usable (shared + * and unused) cache portions. + */ +static int resctrl_io_alloc_init_cbm(struct resctrl_schema *s, u32 closid) +{ + enum resctrl_conf_type peer_type; + struct rdt_resource *r = s->res; + struct rdt_ctrl_domain *d; + int ret; + + rdt_staged_configs_clear(); + + ret = rdtgroup_init_cat(s, closid); + if (ret < 0) + goto out; + + /* Keep CDP_CODE and CDP_DATA of io_alloc CLOSID's CBM in sync. */ + if (resctrl_arch_get_cdp_enabled(r->rid)) { + peer_type = resctrl_peer_type(s->conf_type); + list_for_each_entry(d, &s->res->ctrl_domains, hdr.list) + memcpy(&d->staged_config[peer_type], + &d->staged_config[s->conf_type], + sizeof(d->staged_config[0])); + } + + ret = resctrl_arch_update_domains(r, closid); +out: + rdt_staged_configs_clear(); + return ret; +} + +/* + * resctrl_io_alloc_closid() - io_alloc feature routes I/O traffic using + * the highest available CLOSID. Retrieve the maximum CLOSID supported by the + * resource. Note that if Code Data Prioritization (CDP) is enabled, the number + * of available CLOSIDs is reduced by half. + */ +u32 resctrl_io_alloc_closid(struct rdt_resource *r) +{ + if (resctrl_arch_get_cdp_enabled(r->rid)) + return resctrl_arch_get_num_closid(r) / 2 - 1; + else + return resctrl_arch_get_num_closid(r) - 1; +} + +ssize_t resctrl_io_alloc_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off) +{ + struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); + struct rdt_resource *r = s->res; + char const *grp_name; + u32 io_alloc_closid; + bool enable; + int ret; + + ret = kstrtobool(buf, &enable); + if (ret) + return ret; + + cpus_read_lock(); + mutex_lock(&rdtgroup_mutex); + + rdt_last_cmd_clear(); + + if (!r->cache.io_alloc_capable) { + rdt_last_cmd_printf("io_alloc is not supported on %s\n", s->name); + ret = -ENODEV; + goto out_unlock; + } + + /* If the feature is already up to date, no action is needed. */ + if (resctrl_arch_get_io_alloc_enabled(r) == enable) + goto out_unlock; + + io_alloc_closid = resctrl_io_alloc_closid(r); + if (!resctrl_io_alloc_closid_supported(io_alloc_closid)) { + rdt_last_cmd_printf("io_alloc CLOSID (ctrl_hw_id) %u is not available\n", + io_alloc_closid); + ret = -EINVAL; + goto out_unlock; + } + + if (enable) { + if (!closid_alloc_fixed(io_alloc_closid)) { + grp_name = rdtgroup_name_by_closid(io_alloc_closid); + WARN_ON_ONCE(!grp_name); + rdt_last_cmd_printf("CLOSID (ctrl_hw_id) %u for io_alloc is used by %s group\n", + io_alloc_closid, grp_name ? grp_name : "another"); + ret = -ENOSPC; + goto out_unlock; + } + + ret = resctrl_io_alloc_init_cbm(s, io_alloc_closid); + if (ret) { + rdt_last_cmd_puts("Failed to initialize io_alloc allocations\n"); + closid_free(io_alloc_closid); + goto out_unlock; + } + } else { + closid_free(io_alloc_closid); + } + + ret = resctrl_arch_io_alloc_enable(r, enable); + if (enable && ret) { + rdt_last_cmd_puts("Failed to enable io_alloc feature\n"); + closid_free(io_alloc_closid); + } + +out_unlock: + mutex_unlock(&rdtgroup_mutex); + cpus_read_unlock(); + + return ret ?: nbytes; +} + +int resctrl_io_alloc_cbm_show(struct kernfs_open_file *of, struct seq_file *seq, void *v) +{ + struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); + struct rdt_resource *r = s->res; + int ret = 0; + + cpus_read_lock(); + mutex_lock(&rdtgroup_mutex); + + rdt_last_cmd_clear(); + + if (!r->cache.io_alloc_capable) { + rdt_last_cmd_printf("io_alloc is not supported on %s\n", s->name); + ret = -ENODEV; + goto out_unlock; + } + + if (!resctrl_arch_get_io_alloc_enabled(r)) { + rdt_last_cmd_printf("io_alloc is not enabled on %s\n", s->name); + ret = -EINVAL; + goto out_unlock; + } + + /* + * When CDP is enabled, the CBMs of the highest CLOSID of CDP_CODE and + * CDP_DATA are kept in sync. As a result, the io_alloc CBMs shown for + * either CDP resource are identical and accurately represent the CBMs + * used for I/O. + */ + show_doms(seq, s, NULL, resctrl_io_alloc_closid(r)); + +out_unlock: + mutex_unlock(&rdtgroup_mutex); + cpus_read_unlock(); + return ret; +} + +static int resctrl_io_alloc_parse_line(char *line, struct rdt_resource *r, + struct resctrl_schema *s, u32 closid) +{ + enum resctrl_conf_type peer_type; + struct rdt_parse_data data; + struct rdt_ctrl_domain *d; + char *dom = NULL, *id; + unsigned long dom_id; + +next: + if (!line || line[0] == '\0') + return 0; + + dom = strsep(&line, ";"); + id = strsep(&dom, "="); + if (!dom || kstrtoul(id, 10, &dom_id)) { + rdt_last_cmd_puts("Missing '=' or non-numeric domain\n"); + return -EINVAL; + } + + dom = strim(dom); + list_for_each_entry(d, &r->ctrl_domains, hdr.list) { + if (d->hdr.id == dom_id) { + data.buf = dom; + data.mode = RDT_MODE_SHAREABLE; + data.closid = closid; + if (parse_cbm(&data, s, d)) + return -EINVAL; + /* + * Keep io_alloc CLOSID's CBM of CDP_CODE and CDP_DATA + * in sync. + */ + if (resctrl_arch_get_cdp_enabled(r->rid)) { + peer_type = resctrl_peer_type(s->conf_type); + memcpy(&d->staged_config[peer_type], + &d->staged_config[s->conf_type], + sizeof(d->staged_config[0])); + } + goto next; + } + } + + return -EINVAL; +} + +ssize_t resctrl_io_alloc_cbm_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off) +{ + struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); + struct rdt_resource *r = s->res; + u32 io_alloc_closid; + int ret = 0; + + /* Valid input requires a trailing newline */ + if (nbytes == 0 || buf[nbytes - 1] != '\n') + return -EINVAL; + + buf[nbytes - 1] = '\0'; + + cpus_read_lock(); + mutex_lock(&rdtgroup_mutex); + rdt_last_cmd_clear(); + + if (!r->cache.io_alloc_capable) { + rdt_last_cmd_printf("io_alloc is not supported on %s\n", s->name); + ret = -ENODEV; + goto out_unlock; + } + + if (!resctrl_arch_get_io_alloc_enabled(r)) { + rdt_last_cmd_printf("io_alloc is not enabled on %s\n", s->name); + ret = -EINVAL; + goto out_unlock; + } + + io_alloc_closid = resctrl_io_alloc_closid(r); + + rdt_staged_configs_clear(); + ret = resctrl_io_alloc_parse_line(buf, r, s, io_alloc_closid); + if (ret) + goto out_clear_configs; + + ret = resctrl_arch_update_domains(r, io_alloc_closid); + +out_clear_configs: + rdt_staged_configs_clear(); +out_unlock: + mutex_unlock(&rdtgroup_mutex); + cpus_read_unlock(); + + return ret ?: nbytes; +} diff --git a/fs/resctrl/internal.h b/fs/resctrl/internal.h index cf1fd82dc5a99..1a9b29119f88f 100644 --- a/fs/resctrl/internal.h +++ b/fs/resctrl/internal.h @@ -61,7 +61,14 @@ static inline struct rdt_fs_context *rdt_fc2context(struct fs_context *fc) * READS_TO_REMOTE_MEM) being tracked by @evtid. * Only valid if @evtid is an MBM event. * @configurable: true if the event is configurable + * @any_cpu: true if the event can be read from any CPU + * @is_floating_point: event values are displayed in floating point format + * @binary_bits: number of fixed-point binary bits from architecture, + * only valid if @is_floating_point is true * @enabled: true if the event is enabled + * @arch_priv: Architecture private data for this event. + * The @arch_priv provided by the architecture via + * resctrl_enable_mon_event(). */ struct mon_evt { enum resctrl_event_id evtid; @@ -69,7 +76,11 @@ struct mon_evt { char *name; u32 evt_cfg; bool configurable; + bool any_cpu; + bool is_floating_point; + unsigned int binary_bits; bool enabled; + void *arch_priv; }; extern struct mon_evt mon_event_all[QOS_NUM_EVENTS]; @@ -77,13 +88,16 @@ extern struct mon_evt mon_event_all[QOS_NUM_EVENTS]; #define for_each_mon_event(mevt) for (mevt = &mon_event_all[QOS_FIRST_EVENT]; \ mevt < &mon_event_all[QOS_NUM_EVENTS]; mevt++) +/* Limit for mon_evt::binary_bits */ +#define MAX_BINARY_BITS 27 + /** * struct mon_data - Monitoring details for each event file. * @list: Member of the global @mon_data_kn_priv_list list. * @rid: Resource id associated with the event file. - * @evtid: Event id associated with the event file. - * @sum: Set when event must be summed across multiple - * domains. + * @evt: Event structure associated with the event file. + * @sum: Set for RDT_RESOURCE_L3 when event must be summed + * across multiple domains. * @domid: When @sum is zero this is the domain to which * the event file belongs. When @sum is one this * is the id of the L3 cache that all domains to be @@ -95,7 +109,7 @@ extern struct mon_evt mon_event_all[QOS_NUM_EVENTS]; struct mon_data { struct list_head list; enum resctrl_res_level rid; - enum resctrl_event_id evtid; + struct mon_evt *evt; int domid; bool sum; }; @@ -106,25 +120,27 @@ struct mon_data { * resource group then its event count is summed with the count from all * its child resource groups. * @r: Resource describing the properties of the event being read. - * @d: Domain that the counter should be read from. If NULL then sum all - * domains in @r sharing L3 @ci.id - * @evtid: Which monitor event to read. + * @hdr: Header of domain that the counter should be read from. If NULL then + * sum all domains in @r sharing L3 @ci.id + * @evt: Which monitor event to read. * @first: Initialize MBM counter when true. - * @ci: Cacheinfo for L3. Only set when @d is NULL. Used when summing domains. + * @ci: Cacheinfo for L3. Only set when @hdr is NULL. Used when summing + * domains. * @is_mbm_cntr: true if "mbm_event" counter assignment mode is enabled and it * is an MBM event. * @err: Error encountered when reading counter. - * @val: Returned value of event counter. If @rgrp is a parent resource group, - * @val includes the sum of event counts from its child resource groups. - * If @d is NULL, @val includes the sum of all domains in @r sharing @ci.id, - * (summed across child resource groups if @rgrp is a parent resource group). + * @val: Returned value of event counter. If @rgrp is a parent resource + * group, @val includes the sum of event counts from its child + * resource groups. If @hdr is NULL, @val includes the sum of all + * domains in @r sharing @ci.id, (summed across child resource groups + * if @rgrp is a parent resource group). * @arch_mon_ctx: Hardware monitor allocated for this read request (MPAM only). */ struct rmid_read { struct rdtgroup *rgrp; struct rdt_resource *r; - struct rdt_mon_domain *d; - enum resctrl_event_id evtid; + struct rdt_domain_hdr *hdr; + struct mon_evt *evt; bool first; struct cacheinfo *ci; bool is_mbm_cntr; @@ -243,6 +259,8 @@ struct rdtgroup { #define RFTYPE_ASSIGN_CONFIG BIT(11) +#define RFTYPE_RES_PERF_PKG BIT(12) + #define RFTYPE_CTRL_INFO (RFTYPE_INFO | RFTYPE_CTRL) #define RFTYPE_MON_INFO (RFTYPE_INFO | RFTYPE_MON) @@ -351,23 +369,27 @@ int closids_supported(void); void closid_free(int closid); +int setup_rmid_lru_list(void); + +void free_rmid_lru_list(void); + int alloc_rmid(u32 closid); void free_rmid(u32 closid, u32 rmid); -void resctrl_mon_resource_exit(void); +int resctrl_l3_mon_resource_init(void); + +void resctrl_l3_mon_resource_exit(void); void mon_event_count(void *info); int rdtgroup_mondata_show(struct seq_file *m, void *arg); void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, - struct rdt_mon_domain *d, struct rdtgroup *rdtgrp, - cpumask_t *cpumask, int evtid, int first); + struct rdt_domain_hdr *hdr, struct rdtgroup *rdtgrp, + cpumask_t *cpumask, struct mon_evt *evt, int first); -int resctrl_mon_resource_init(void); - -void mbm_setup_overflow_handler(struct rdt_mon_domain *dom, +void mbm_setup_overflow_handler(struct rdt_l3_mon_domain *dom, unsigned long delay_ms, int exclude_cpu); @@ -375,14 +397,14 @@ void mbm_handle_overflow(struct work_struct *work); bool is_mba_sc(struct rdt_resource *r); -void cqm_setup_limbo_handler(struct rdt_mon_domain *dom, unsigned long delay_ms, +void cqm_setup_limbo_handler(struct rdt_l3_mon_domain *dom, unsigned long delay_ms, int exclude_cpu); void cqm_handle_limbo(struct work_struct *work); -bool has_busy_rmid(struct rdt_mon_domain *d); +bool has_busy_rmid(struct rdt_l3_mon_domain *d); -void __check_limbo(struct rdt_mon_domain *d, bool force_free); +void __check_limbo(struct rdt_l3_mon_domain *d, bool force_free); void resctrl_file_fflags_init(const char *config, unsigned long fflags); @@ -390,6 +412,8 @@ void rdt_staged_configs_clear(void); bool closid_allocated(unsigned int closid); +bool closid_alloc_fixed(u32 closid); + int resctrl_find_cleanest_closid(void); void *rdt_kn_parent_priv(struct kernfs_node *kn); @@ -426,6 +450,21 @@ int mbm_L3_assignments_show(struct kernfs_open_file *of, struct seq_file *s, voi ssize_t mbm_L3_assignments_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off); +int resctrl_io_alloc_show(struct kernfs_open_file *of, struct seq_file *seq, void *v); + +int rdtgroup_init_cat(struct resctrl_schema *s, u32 closid); + +enum resctrl_conf_type resctrl_peer_type(enum resctrl_conf_type my_type); + +ssize_t resctrl_io_alloc_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off); + +const char *rdtgroup_name_by_closid(u32 closid); +int resctrl_io_alloc_cbm_show(struct kernfs_open_file *of, struct seq_file *seq, + void *v); +ssize_t resctrl_io_alloc_cbm_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off); +u32 resctrl_io_alloc_closid(struct rdt_resource *r); #ifdef CONFIG_RESCTRL_FS_PSEUDO_LOCK int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp); diff --git a/fs/resctrl/monitor.c b/fs/resctrl/monitor.c index 572a9925bd6ca..0cd5476a483a0 100644 --- a/fs/resctrl/monitor.c +++ b/fs/resctrl/monitor.c @@ -130,16 +130,18 @@ static void limbo_release_entry(struct rmid_entry *entry) * decrement the count. If the busy count gets to zero on an RMID, we * free the RMID */ -void __check_limbo(struct rdt_mon_domain *d, bool force_free) +void __check_limbo(struct rdt_l3_mon_domain *d, bool force_free) { struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3); u32 idx_limit = resctrl_arch_system_num_rmid_idx(); struct rmid_entry *entry; u32 idx, cur_idx = 1; void *arch_mon_ctx; + void *arch_priv; bool rmid_dirty; u64 val = 0; + arch_priv = mon_event_all[QOS_L3_OCCUP_EVENT_ID].arch_priv; arch_mon_ctx = resctrl_arch_mon_ctx_alloc(r, QOS_L3_OCCUP_EVENT_ID); if (IS_ERR(arch_mon_ctx)) { pr_warn_ratelimited("Failed to allocate monitor context: %ld", @@ -159,8 +161,8 @@ void __check_limbo(struct rdt_mon_domain *d, bool force_free) break; entry = __rmid_entry(idx); - if (resctrl_arch_rmid_read(r, d, entry->closid, entry->rmid, - QOS_L3_OCCUP_EVENT_ID, &val, + if (resctrl_arch_rmid_read(r, &d->hdr, entry->closid, entry->rmid, + QOS_L3_OCCUP_EVENT_ID, arch_priv, &val, arch_mon_ctx)) { rmid_dirty = true; } else { @@ -188,7 +190,7 @@ void __check_limbo(struct rdt_mon_domain *d, bool force_free) resctrl_arch_mon_ctx_free(r, QOS_L3_OCCUP_EVENT_ID, arch_mon_ctx); } -bool has_busy_rmid(struct rdt_mon_domain *d) +bool has_busy_rmid(struct rdt_l3_mon_domain *d) { u32 idx_limit = resctrl_arch_system_num_rmid_idx(); @@ -289,7 +291,7 @@ int alloc_rmid(u32 closid) static void add_rmid_to_limbo(struct rmid_entry *entry) { struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3); - struct rdt_mon_domain *d; + struct rdt_l3_mon_domain *d; u32 idx; lockdep_assert_held(&rdtgroup_mutex); @@ -342,7 +344,7 @@ void free_rmid(u32 closid, u32 rmid) list_add_tail(&entry->list, &rmid_free_lru); } -static struct mbm_state *get_mbm_state(struct rdt_mon_domain *d, u32 closid, +static struct mbm_state *get_mbm_state(struct rdt_l3_mon_domain *d, u32 closid, u32 rmid, enum resctrl_event_id evtid) { u32 idx = resctrl_arch_rmid_idx_encode(closid, rmid); @@ -362,7 +364,7 @@ static struct mbm_state *get_mbm_state(struct rdt_mon_domain *d, u32 closid, * Return: * Valid counter ID on success, or -ENOENT on failure. */ -static int mbm_cntr_get(struct rdt_resource *r, struct rdt_mon_domain *d, +static int mbm_cntr_get(struct rdt_resource *r, struct rdt_l3_mon_domain *d, struct rdtgroup *rdtgrp, enum resctrl_event_id evtid) { int cntr_id; @@ -389,7 +391,7 @@ static int mbm_cntr_get(struct rdt_resource *r, struct rdt_mon_domain *d, * Return: * Valid counter ID on success, or -ENOSPC on failure. */ -static int mbm_cntr_alloc(struct rdt_resource *r, struct rdt_mon_domain *d, +static int mbm_cntr_alloc(struct rdt_resource *r, struct rdt_l3_mon_domain *d, struct rdtgroup *rdtgrp, enum resctrl_event_id evtid) { int cntr_id; @@ -408,24 +410,29 @@ static int mbm_cntr_alloc(struct rdt_resource *r, struct rdt_mon_domain *d, /* * mbm_cntr_free() - Clear the counter ID configuration details in the domain @d. */ -static void mbm_cntr_free(struct rdt_mon_domain *d, int cntr_id) +static void mbm_cntr_free(struct rdt_l3_mon_domain *d, int cntr_id) { memset(&d->cntr_cfg[cntr_id], 0, sizeof(*d->cntr_cfg)); } -static int __mon_event_count(struct rdtgroup *rdtgrp, struct rmid_read *rr) +static int __l3_mon_event_count(struct rdtgroup *rdtgrp, struct rmid_read *rr) { int cpu = smp_processor_id(); u32 closid = rdtgrp->closid; u32 rmid = rdtgrp->mon.rmid; - struct rdt_mon_domain *d; + struct rdt_l3_mon_domain *d; int cntr_id = -ENOENT; struct mbm_state *m; - int err, ret; u64 tval = 0; + if (!domain_header_is_valid(rr->hdr, RESCTRL_MON_DOMAIN, RDT_RESOURCE_L3)) { + rr->err = -EIO; + return -EINVAL; + } + d = container_of(rr->hdr, struct rdt_l3_mon_domain, hdr); + if (rr->is_mbm_cntr) { - cntr_id = mbm_cntr_get(rr->r, rr->d, rdtgrp, rr->evtid); + cntr_id = mbm_cntr_get(rr->r, d, rdtgrp, rr->evt->evtid); if (cntr_id < 0) { rr->err = -ENOENT; return -EINVAL; @@ -434,31 +441,51 @@ static int __mon_event_count(struct rdtgroup *rdtgrp, struct rmid_read *rr) if (rr->first) { if (rr->is_mbm_cntr) - resctrl_arch_reset_cntr(rr->r, rr->d, closid, rmid, cntr_id, rr->evtid); + resctrl_arch_reset_cntr(rr->r, d, closid, rmid, cntr_id, rr->evt->evtid); else - resctrl_arch_reset_rmid(rr->r, rr->d, closid, rmid, rr->evtid); - m = get_mbm_state(rr->d, closid, rmid, rr->evtid); + resctrl_arch_reset_rmid(rr->r, d, closid, rmid, rr->evt->evtid); + m = get_mbm_state(d, closid, rmid, rr->evt->evtid); if (m) memset(m, 0, sizeof(struct mbm_state)); return 0; } - if (rr->d) { - /* Reading a single domain, must be on a CPU in that domain. */ - if (!cpumask_test_cpu(cpu, &rr->d->hdr.cpu_mask)) - return -EINVAL; - if (rr->is_mbm_cntr) - rr->err = resctrl_arch_cntr_read(rr->r, rr->d, closid, rmid, cntr_id, - rr->evtid, &tval); - else - rr->err = resctrl_arch_rmid_read(rr->r, rr->d, closid, rmid, - rr->evtid, &tval, rr->arch_mon_ctx); - if (rr->err) - return rr->err; + /* Reading a single domain, must be on a CPU in that domain. */ + if (!cpumask_test_cpu(cpu, &d->hdr.cpu_mask)) + return -EINVAL; + if (rr->is_mbm_cntr) + rr->err = resctrl_arch_cntr_read(rr->r, d, closid, rmid, cntr_id, + rr->evt->evtid, &tval); + else + rr->err = resctrl_arch_rmid_read(rr->r, rr->hdr, closid, rmid, + rr->evt->evtid, rr->evt->arch_priv, + &tval, rr->arch_mon_ctx); + if (rr->err) + return rr->err; - rr->val += tval; + rr->val += tval; - return 0; + return 0; +} + +static int __l3_mon_event_count_sum(struct rdtgroup *rdtgrp, struct rmid_read *rr) +{ + int cpu = smp_processor_id(); + u32 closid = rdtgrp->closid; + u32 rmid = rdtgrp->mon.rmid; + struct rdt_l3_mon_domain *d; + u64 tval = 0; + int err, ret; + + /* + * Summing across domains is only done for systems that implement + * Sub-NUMA Cluster. There is no overlap with systems that support + * assignable counters. + */ + if (rr->is_mbm_cntr) { + pr_warn_once("Summing domains using assignable counters is not supported\n"); + rr->err = -EINVAL; + return -EINVAL; } /* Summing domains that share a cache, must be on a CPU for that cache. */ @@ -476,12 +503,9 @@ static int __mon_event_count(struct rdtgroup *rdtgrp, struct rmid_read *rr) list_for_each_entry(d, &rr->r->mon_domains, hdr.list) { if (d->ci_id != rr->ci->id) continue; - if (rr->is_mbm_cntr) - err = resctrl_arch_cntr_read(rr->r, d, closid, rmid, cntr_id, - rr->evtid, &tval); - else - err = resctrl_arch_rmid_read(rr->r, d, closid, rmid, - rr->evtid, &tval, rr->arch_mon_ctx); + err = resctrl_arch_rmid_read(rr->r, &d->hdr, closid, rmid, + rr->evt->evtid, rr->evt->arch_priv, + &tval, rr->arch_mon_ctx); if (!err) { rr->val += tval; ret = 0; @@ -494,6 +518,35 @@ static int __mon_event_count(struct rdtgroup *rdtgrp, struct rmid_read *rr) return ret; } +static int __mon_event_count(struct rdtgroup *rdtgrp, struct rmid_read *rr) +{ + switch (rr->r->rid) { + case RDT_RESOURCE_L3: + WARN_ON_ONCE(rr->evt->any_cpu); + if (rr->hdr) + return __l3_mon_event_count(rdtgrp, rr); + else + return __l3_mon_event_count_sum(rdtgrp, rr); + case RDT_RESOURCE_PERF_PKG: { + u64 tval = 0; + + rr->err = resctrl_arch_rmid_read(rr->r, rr->hdr, rdtgrp->closid, + rdtgrp->mon.rmid, rr->evt->evtid, + rr->evt->arch_priv, + &tval, rr->arch_mon_ctx); + if (rr->err) + return rr->err; + + rr->val += tval; + + return 0; + } + default: + rr->err = -EINVAL; + return -EINVAL; + } +} + /* * mbm_bw_count() - Update bw count from values previously read by * __mon_event_count(). @@ -511,9 +564,13 @@ static void mbm_bw_count(struct rdtgroup *rdtgrp, struct rmid_read *rr) u64 cur_bw, bytes, cur_bytes; u32 closid = rdtgrp->closid; u32 rmid = rdtgrp->mon.rmid; + struct rdt_l3_mon_domain *d; struct mbm_state *m; - m = get_mbm_state(rr->d, closid, rmid, rr->evtid); + if (!domain_header_is_valid(rr->hdr, RESCTRL_MON_DOMAIN, RDT_RESOURCE_L3)) + return; + d = container_of(rr->hdr, struct rdt_l3_mon_domain, hdr); + m = get_mbm_state(d, closid, rmid, rr->evt->evtid); if (WARN_ON_ONCE(!m)) return; @@ -612,7 +669,7 @@ static struct rdt_ctrl_domain *get_ctrl_domain_from_cpu(int cpu, * throttle MSRs already have low percentage values. To avoid * unnecessarily restricting such rdtgroups, we also increase the bandwidth. */ -static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_mon_domain *dom_mbm) +static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_l3_mon_domain *dom_mbm) { u32 closid, rmid, cur_msr_val, new_msr_val; struct mbm_state *pmbm_data, *cmbm_data; @@ -680,18 +737,18 @@ static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_mon_domain *dom_mbm) resctrl_arch_update_one(r_mba, dom_mba, closid, CDP_NONE, new_msr_val); } -static void mbm_update_one_event(struct rdt_resource *r, struct rdt_mon_domain *d, +static void mbm_update_one_event(struct rdt_resource *r, struct rdt_l3_mon_domain *d, struct rdtgroup *rdtgrp, enum resctrl_event_id evtid) { struct rmid_read rr = {0}; rr.r = r; - rr.d = d; - rr.evtid = evtid; + rr.hdr = &d->hdr; + rr.evt = &mon_event_all[evtid]; if (resctrl_arch_mbm_cntr_assign_enabled(r)) { rr.is_mbm_cntr = true; } else { - rr.arch_mon_ctx = resctrl_arch_mon_ctx_alloc(rr.r, rr.evtid); + rr.arch_mon_ctx = resctrl_arch_mon_ctx_alloc(rr.r, evtid); if (IS_ERR(rr.arch_mon_ctx)) { pr_warn_ratelimited("Failed to allocate monitor context: %ld", PTR_ERR(rr.arch_mon_ctx)); @@ -709,10 +766,10 @@ static void mbm_update_one_event(struct rdt_resource *r, struct rdt_mon_domain * mbm_bw_count(rdtgrp, &rr); if (rr.arch_mon_ctx) - resctrl_arch_mon_ctx_free(rr.r, rr.evtid, rr.arch_mon_ctx); + resctrl_arch_mon_ctx_free(rr.r, evtid, rr.arch_mon_ctx); } -static void mbm_update(struct rdt_resource *r, struct rdt_mon_domain *d, +static void mbm_update(struct rdt_resource *r, struct rdt_l3_mon_domain *d, struct rdtgroup *rdtgrp) { /* @@ -733,12 +790,12 @@ static void mbm_update(struct rdt_resource *r, struct rdt_mon_domain *d, void cqm_handle_limbo(struct work_struct *work) { unsigned long delay = msecs_to_jiffies(CQM_LIMBOCHECK_INTERVAL); - struct rdt_mon_domain *d; + struct rdt_l3_mon_domain *d; cpus_read_lock(); mutex_lock(&rdtgroup_mutex); - d = container_of(work, struct rdt_mon_domain, cqm_limbo.work); + d = container_of(work, struct rdt_l3_mon_domain, cqm_limbo.work); __check_limbo(d, false); @@ -761,7 +818,7 @@ void cqm_handle_limbo(struct work_struct *work) * @exclude_cpu: Which CPU the handler should not run on, * RESCTRL_PICK_ANY_CPU to pick any CPU. */ -void cqm_setup_limbo_handler(struct rdt_mon_domain *dom, unsigned long delay_ms, +void cqm_setup_limbo_handler(struct rdt_l3_mon_domain *dom, unsigned long delay_ms, int exclude_cpu) { unsigned long delay = msecs_to_jiffies(delay_ms); @@ -778,7 +835,7 @@ void mbm_handle_overflow(struct work_struct *work) { unsigned long delay = msecs_to_jiffies(MBM_OVERFLOW_INTERVAL); struct rdtgroup *prgrp, *crgrp; - struct rdt_mon_domain *d; + struct rdt_l3_mon_domain *d; struct list_head *head; struct rdt_resource *r; @@ -793,7 +850,7 @@ void mbm_handle_overflow(struct work_struct *work) goto out_unlock; r = resctrl_arch_get_resource(RDT_RESOURCE_L3); - d = container_of(work, struct rdt_mon_domain, mbm_over.work); + d = container_of(work, struct rdt_l3_mon_domain, mbm_over.work); list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) { mbm_update(r, d, prgrp); @@ -827,7 +884,7 @@ void mbm_handle_overflow(struct work_struct *work) * @exclude_cpu: Which CPU the handler should not run on, * RESCTRL_PICK_ANY_CPU to pick any CPU. */ -void mbm_setup_overflow_handler(struct rdt_mon_domain *dom, unsigned long delay_ms, +void mbm_setup_overflow_handler(struct rdt_l3_mon_domain *dom, unsigned long delay_ms, int exclude_cpu) { unsigned long delay = msecs_to_jiffies(delay_ms); @@ -846,42 +903,29 @@ void mbm_setup_overflow_handler(struct rdt_mon_domain *dom, unsigned long delay_ schedule_delayed_work_on(cpu, &dom->mbm_over, delay); } -static int dom_data_init(struct rdt_resource *r) +int setup_rmid_lru_list(void) { - u32 idx_limit = resctrl_arch_system_num_rmid_idx(); - u32 num_closid = resctrl_arch_get_num_closid(r); struct rmid_entry *entry = NULL; - int err = 0, i; + u32 idx_limit; u32 idx; + int i; - mutex_lock(&rdtgroup_mutex); - if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) { - u32 *tmp; - - /* - * If the architecture hasn't provided a sanitised value here, - * this may result in larger arrays than necessary. Resctrl will - * use a smaller system wide value based on the resources in - * use. - */ - tmp = kcalloc(num_closid, sizeof(*tmp), GFP_KERNEL); - if (!tmp) { - err = -ENOMEM; - goto out_unlock; - } + if (!resctrl_arch_mon_capable()) + return 0; - closid_num_dirty_rmid = tmp; - } + /* + * Called on every mount, but the number of RMIDs cannot change + * after the first mount, so keep using the same set of rmid_ptrs[] + * until resctrl_exit(). Note that the limbo handler continues to + * access rmid_ptrs[] after resctrl is unmounted. + */ + if (rmid_ptrs) + return 0; + idx_limit = resctrl_arch_system_num_rmid_idx(); rmid_ptrs = kcalloc(idx_limit, sizeof(struct rmid_entry), GFP_KERNEL); - if (!rmid_ptrs) { - if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) { - kfree(closid_num_dirty_rmid); - closid_num_dirty_rmid = NULL; - } - err = -ENOMEM; - goto out_unlock; - } + if (!rmid_ptrs) + return -ENOMEM; for (i = 0; i < idx_limit; i++) { entry = &rmid_ptrs[i]; @@ -894,71 +938,76 @@ static int dom_data_init(struct rdt_resource *r) /* * RESCTRL_RESERVED_CLOSID and RESCTRL_RESERVED_RMID are special and * are always allocated. These are used for the rdtgroup_default - * control group, which will be setup later in resctrl_init(). + * control group, which was setup earlier in rdtgroup_setup_default(). */ idx = resctrl_arch_rmid_idx_encode(RESCTRL_RESERVED_CLOSID, RESCTRL_RESERVED_RMID); entry = __rmid_entry(idx); list_del(&entry->list); -out_unlock: - mutex_unlock(&rdtgroup_mutex); - - return err; + return 0; } -static void dom_data_exit(struct rdt_resource *r) +void free_rmid_lru_list(void) { - mutex_lock(&rdtgroup_mutex); - - if (!r->mon_capable) - goto out_unlock; - - if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) { - kfree(closid_num_dirty_rmid); - closid_num_dirty_rmid = NULL; - } + if (!resctrl_arch_mon_capable()) + return; + mutex_lock(&rdtgroup_mutex); kfree(rmid_ptrs); rmid_ptrs = NULL; - -out_unlock: mutex_unlock(&rdtgroup_mutex); } +#define MON_EVENT(_eventid, _name, _res, _fp) \ + [_eventid] = { \ + .name = _name, \ + .evtid = _eventid, \ + .rid = _res, \ + .is_floating_point = _fp, \ +} + /* * All available events. Architecture code marks the ones that * are supported by a system using resctrl_enable_mon_event() * to set .enabled. */ struct mon_evt mon_event_all[QOS_NUM_EVENTS] = { - [QOS_L3_OCCUP_EVENT_ID] = { - .name = "llc_occupancy", - .evtid = QOS_L3_OCCUP_EVENT_ID, - .rid = RDT_RESOURCE_L3, - }, - [QOS_L3_MBM_TOTAL_EVENT_ID] = { - .name = "mbm_total_bytes", - .evtid = QOS_L3_MBM_TOTAL_EVENT_ID, - .rid = RDT_RESOURCE_L3, - }, - [QOS_L3_MBM_LOCAL_EVENT_ID] = { - .name = "mbm_local_bytes", - .evtid = QOS_L3_MBM_LOCAL_EVENT_ID, - .rid = RDT_RESOURCE_L3, - }, + MON_EVENT(QOS_L3_OCCUP_EVENT_ID, "llc_occupancy", RDT_RESOURCE_L3, false), + MON_EVENT(QOS_L3_MBM_TOTAL_EVENT_ID, "mbm_total_bytes", RDT_RESOURCE_L3, false), + MON_EVENT(QOS_L3_MBM_LOCAL_EVENT_ID, "mbm_local_bytes", RDT_RESOURCE_L3, false), + MON_EVENT(PMT_EVENT_ENERGY, "core_energy", RDT_RESOURCE_PERF_PKG, true), + MON_EVENT(PMT_EVENT_ACTIVITY, "activity", RDT_RESOURCE_PERF_PKG, true), + MON_EVENT(PMT_EVENT_STALLS_LLC_HIT, "stalls_llc_hit", RDT_RESOURCE_PERF_PKG, false), + MON_EVENT(PMT_EVENT_C1_RES, "c1_res", RDT_RESOURCE_PERF_PKG, false), + MON_EVENT(PMT_EVENT_UNHALTED_CORE_CYCLES, "unhalted_core_cycles", RDT_RESOURCE_PERF_PKG, false), + MON_EVENT(PMT_EVENT_STALLS_LLC_MISS, "stalls_llc_miss", RDT_RESOURCE_PERF_PKG, false), + MON_EVENT(PMT_EVENT_AUTO_C6_RES, "c6_res", RDT_RESOURCE_PERF_PKG, false), + MON_EVENT(PMT_EVENT_UNHALTED_REF_CYCLES, "unhalted_ref_cycles", RDT_RESOURCE_PERF_PKG, false), + MON_EVENT(PMT_EVENT_UOPS_RETIRED, "uops_retired", RDT_RESOURCE_PERF_PKG, false), }; -void resctrl_enable_mon_event(enum resctrl_event_id eventid) +bool resctrl_enable_mon_event(enum resctrl_event_id eventid, bool any_cpu, + unsigned int binary_bits, void *arch_priv) { - if (WARN_ON_ONCE(eventid < QOS_FIRST_EVENT || eventid >= QOS_NUM_EVENTS)) - return; + if (WARN_ON_ONCE(eventid < QOS_FIRST_EVENT || eventid >= QOS_NUM_EVENTS || + binary_bits > MAX_BINARY_BITS)) + return false; if (mon_event_all[eventid].enabled) { pr_warn("Duplicate enable for event %d\n", eventid); - return; + return false; + } + if (binary_bits && !mon_event_all[eventid].is_floating_point) { + pr_warn("Event %d may not be floating point\n", eventid); + return false; } + mon_event_all[eventid].any_cpu = any_cpu; + mon_event_all[eventid].binary_bits = binary_bits; + mon_event_all[eventid].arch_priv = arch_priv; mon_event_all[eventid].enabled = true; + + return true; } bool resctrl_is_mon_event_enabled(enum resctrl_event_id eventid) @@ -1082,7 +1131,7 @@ ssize_t resctrl_mbm_assign_on_mkdir_write(struct kernfs_open_file *of, char *buf * mbm_cntr_free_all() - Clear all the counter ID configuration details in the * domain @d. Called when mbm_assign_mode is changed. */ -static void mbm_cntr_free_all(struct rdt_resource *r, struct rdt_mon_domain *d) +static void mbm_cntr_free_all(struct rdt_resource *r, struct rdt_l3_mon_domain *d) { memset(d->cntr_cfg, 0, sizeof(*d->cntr_cfg) * r->mon.num_mbm_cntrs); } @@ -1091,7 +1140,7 @@ static void mbm_cntr_free_all(struct rdt_resource *r, struct rdt_mon_domain *d) * resctrl_reset_rmid_all() - Reset all non-architecture states for all the * supported RMIDs. */ -static void resctrl_reset_rmid_all(struct rdt_resource *r, struct rdt_mon_domain *d) +static void resctrl_reset_rmid_all(struct rdt_resource *r, struct rdt_l3_mon_domain *d) { u32 idx_limit = resctrl_arch_system_num_rmid_idx(); enum resctrl_event_id evt; @@ -1112,7 +1161,7 @@ static void resctrl_reset_rmid_all(struct rdt_resource *r, struct rdt_mon_domain * Assign the counter if @assign is true else unassign the counter. Reset the * associated non-architectural state. */ -static void rdtgroup_assign_cntr(struct rdt_resource *r, struct rdt_mon_domain *d, +static void rdtgroup_assign_cntr(struct rdt_resource *r, struct rdt_l3_mon_domain *d, enum resctrl_event_id evtid, u32 rmid, u32 closid, u32 cntr_id, bool assign) { @@ -1132,7 +1181,7 @@ static void rdtgroup_assign_cntr(struct rdt_resource *r, struct rdt_mon_domain * * Return: * 0 on success, < 0 on failure. */ -static int rdtgroup_alloc_assign_cntr(struct rdt_resource *r, struct rdt_mon_domain *d, +static int rdtgroup_alloc_assign_cntr(struct rdt_resource *r, struct rdt_l3_mon_domain *d, struct rdtgroup *rdtgrp, struct mon_evt *mevt) { int cntr_id; @@ -1167,7 +1216,7 @@ static int rdtgroup_alloc_assign_cntr(struct rdt_resource *r, struct rdt_mon_dom * Return: * 0 on success, < 0 on failure. */ -static int rdtgroup_assign_cntr_event(struct rdt_mon_domain *d, struct rdtgroup *rdtgrp, +static int rdtgroup_assign_cntr_event(struct rdt_l3_mon_domain *d, struct rdtgroup *rdtgrp, struct mon_evt *mevt) { struct rdt_resource *r = resctrl_arch_get_resource(mevt->rid); @@ -1217,7 +1266,7 @@ void rdtgroup_assign_cntrs(struct rdtgroup *rdtgrp) * rdtgroup_free_unassign_cntr() - Unassign and reset the counter ID configuration * for the event pointed to by @mevt within the domain @d and resctrl group @rdtgrp. */ -static void rdtgroup_free_unassign_cntr(struct rdt_resource *r, struct rdt_mon_domain *d, +static void rdtgroup_free_unassign_cntr(struct rdt_resource *r, struct rdt_l3_mon_domain *d, struct rdtgroup *rdtgrp, struct mon_evt *mevt) { int cntr_id; @@ -1238,7 +1287,7 @@ static void rdtgroup_free_unassign_cntr(struct rdt_resource *r, struct rdt_mon_d * the event structure @mevt from the domain @d and the group @rdtgrp. Unassign * the counters from all the domains if @d is NULL else unassign from @d. */ -static void rdtgroup_unassign_cntr_event(struct rdt_mon_domain *d, struct rdtgroup *rdtgrp, +static void rdtgroup_unassign_cntr_event(struct rdt_l3_mon_domain *d, struct rdtgroup *rdtgrp, struct mon_evt *mevt) { struct rdt_resource *r = resctrl_arch_get_resource(mevt->rid); @@ -1313,7 +1362,7 @@ static int resctrl_parse_mem_transactions(char *tok, u32 *val) static void rdtgroup_update_cntr_event(struct rdt_resource *r, struct rdtgroup *rdtgrp, enum resctrl_event_id evtid) { - struct rdt_mon_domain *d; + struct rdt_l3_mon_domain *d; int cntr_id; list_for_each_entry(d, &r->mon_domains, hdr.list) { @@ -1419,7 +1468,7 @@ ssize_t resctrl_mbm_assign_mode_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct rdt_resource *r = rdt_kn_parent_priv(of->kn); - struct rdt_mon_domain *d; + struct rdt_l3_mon_domain *d; int ret = 0; bool enable; @@ -1492,7 +1541,7 @@ int resctrl_num_mbm_cntrs_show(struct kernfs_open_file *of, struct seq_file *s, void *v) { struct rdt_resource *r = rdt_kn_parent_priv(of->kn); - struct rdt_mon_domain *dom; + struct rdt_l3_mon_domain *dom; bool sep = false; cpus_read_lock(); @@ -1516,7 +1565,7 @@ int resctrl_available_mbm_cntrs_show(struct kernfs_open_file *of, struct seq_file *s, void *v) { struct rdt_resource *r = rdt_kn_parent_priv(of->kn); - struct rdt_mon_domain *dom; + struct rdt_l3_mon_domain *dom; bool sep = false; u32 cntrs, i; int ret = 0; @@ -1557,7 +1606,7 @@ int resctrl_available_mbm_cntrs_show(struct kernfs_open_file *of, int mbm_L3_assignments_show(struct kernfs_open_file *of, struct seq_file *s, void *v) { struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3); - struct rdt_mon_domain *d; + struct rdt_l3_mon_domain *d; struct rdtgroup *rdtgrp; struct mon_evt *mevt; int ret = 0; @@ -1620,7 +1669,7 @@ static struct mon_evt *mbm_get_mon_event_by_name(struct rdt_resource *r, char *n return NULL; } -static int rdtgroup_modify_assign_state(char *assign, struct rdt_mon_domain *d, +static int rdtgroup_modify_assign_state(char *assign, struct rdt_l3_mon_domain *d, struct rdtgroup *rdtgrp, struct mon_evt *mevt) { int ret = 0; @@ -1646,7 +1695,7 @@ static int rdtgroup_modify_assign_state(char *assign, struct rdt_mon_domain *d, static int resctrl_parse_mbm_assignment(struct rdt_resource *r, struct rdtgroup *rdtgrp, char *event, char *tok) { - struct rdt_mon_domain *d; + struct rdt_l3_mon_domain *d; unsigned long dom_id = 0; char *dom_str, *id_str; struct mon_evt *mevt; @@ -1741,19 +1790,59 @@ ssize_t mbm_L3_assignments_write(struct kernfs_open_file *of, char *buf, return ret ?: nbytes; } +static int closid_num_dirty_rmid_alloc(struct rdt_resource *r) +{ + if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) { + u32 num_closid = resctrl_arch_get_num_closid(r); + u32 *tmp; + + /* For ARM memory ordering access to closid_num_dirty_rmid */ + mutex_lock(&rdtgroup_mutex); + + /* + * If the architecture hasn't provided a sanitised value here, + * this may result in larger arrays than necessary. Resctrl will + * use a smaller system wide value based on the resources in + * use. + */ + tmp = kcalloc(num_closid, sizeof(*tmp), GFP_KERNEL); + if (!tmp) { + mutex_unlock(&rdtgroup_mutex); + return -ENOMEM; + } + + closid_num_dirty_rmid = tmp; + + mutex_unlock(&rdtgroup_mutex); + } + + return 0; +} + +static void closid_num_dirty_rmid_free(void) +{ + if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) { + mutex_lock(&rdtgroup_mutex); + kfree(closid_num_dirty_rmid); + closid_num_dirty_rmid = NULL; + mutex_unlock(&rdtgroup_mutex); + } +} + /** - * resctrl_mon_resource_init() - Initialise global monitoring structures. + * resctrl_l3_mon_resource_init() - Initialise global monitoring structures. * * Allocate and initialise global monitor resources that do not belong to a - * specific domain. i.e. the rmid_ptrs[] used for the limbo and free lists. + * specific domain. i.e. the closid_num_dirty_rmid[] used to find the CLOSID + * with the cleanest set of RMIDs. * Called once during boot after the struct rdt_resource's have been configured * but before the filesystem is mounted. * Resctrl's cpuhp callbacks may be called before this point to bring a domain * online. * - * Returns 0 for success, or -ENOMEM. + * Return: 0 for success, or -ENOMEM. */ -int resctrl_mon_resource_init(void) +int resctrl_l3_mon_resource_init(void) { struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3); int ret; @@ -1761,7 +1850,7 @@ int resctrl_mon_resource_init(void) if (!r->mon_capable) return 0; - ret = dom_data_init(r); + ret = closid_num_dirty_rmid_alloc(r); if (ret) return ret; @@ -1803,9 +1892,12 @@ int resctrl_mon_resource_init(void) return 0; } -void resctrl_mon_resource_exit(void) +void resctrl_l3_mon_resource_exit(void) { struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3); - dom_data_exit(r); + if (!r->mon_capable) + return; + + closid_num_dirty_rmid_free(); } diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c index 0320360cd7a6e..cc615fc088f4e 100644 --- a/fs/resctrl/rdtgroup.c +++ b/fs/resctrl/rdtgroup.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -226,6 +227,11 @@ bool closid_allocated(unsigned int closid) return !test_bit(closid, closid_free_map); } +bool closid_alloc_fixed(u32 closid) +{ + return __test_and_clear_bit(closid, closid_free_map); +} + /** * rdtgroup_mode_by_closid - Return mode of resource group with closid * @closid: closid if the resource group @@ -1057,15 +1063,17 @@ static int rdt_bit_usage_show(struct kernfs_open_file *of, cpus_read_lock(); mutex_lock(&rdtgroup_mutex); - hw_shareable = r->cache.shareable_bits; list_for_each_entry(dom, &r->ctrl_domains, hdr.list) { if (sep) seq_putc(seq, ';'); + hw_shareable = r->cache.shareable_bits; sw_shareable = 0; exclusive = 0; seq_printf(seq, "%d=", dom->hdr.id); for (i = 0; i < closids_supported(); i++) { - if (!closid_allocated(i)) + if (!closid_allocated(i) || + (resctrl_arch_get_io_alloc_enabled(r) && + i == resctrl_io_alloc_closid(r))) continue; ctrl_val = resctrl_arch_get_config(r, dom, i, s->conf_type); @@ -1093,6 +1101,21 @@ static int rdt_bit_usage_show(struct kernfs_open_file *of, break; } } + + /* + * When the "io_alloc" feature is enabled, a portion of the cache + * is configured for shared use between hardware and software. + * Also, when CDP is enabled the CBMs of CDP_CODE and CDP_DATA + * resources are kept in sync. So, the CBMs for "io_alloc" can + * be accessed through either resource. + */ + if (resctrl_arch_get_io_alloc_enabled(r)) { + ctrl_val = resctrl_arch_get_config(r, dom, + resctrl_io_alloc_closid(r), + s->conf_type); + hw_shareable |= ctrl_val; + } + for (i = r->cache.cbm_len - 1; i >= 0; i--) { pseudo_locked = dom->plr ? dom->plr->cbm : 0; hwb = test_bit(i, &hw_shareable); @@ -1135,7 +1158,7 @@ static int rdt_num_rmids_show(struct kernfs_open_file *of, { struct rdt_resource *r = rdt_kn_parent_priv(of->kn); - seq_printf(seq, "%d\n", r->mon.num_rmid); + seq_printf(seq, "%u\n", r->mon.num_rmid); return 0; } @@ -1247,7 +1270,7 @@ static int rdtgroup_mode_show(struct kernfs_open_file *of, return 0; } -static enum resctrl_conf_type resctrl_peer_type(enum resctrl_conf_type my_type) +enum resctrl_conf_type resctrl_peer_type(enum resctrl_conf_type my_type) { switch (my_type) { case CDP_CODE: @@ -1618,7 +1641,7 @@ static void mondata_config_read(struct resctrl_mon_config_info *mon_info) static int mbm_config_show(struct seq_file *s, struct rdt_resource *r, u32 evtid) { struct resctrl_mon_config_info mon_info; - struct rdt_mon_domain *dom; + struct rdt_l3_mon_domain *dom; bool sep = false; cpus_read_lock(); @@ -1666,7 +1689,7 @@ static int mbm_local_bytes_config_show(struct kernfs_open_file *of, } static void mbm_config_write_domain(struct rdt_resource *r, - struct rdt_mon_domain *d, u32 evtid, u32 val) + struct rdt_l3_mon_domain *d, u32 evtid, u32 val) { struct resctrl_mon_config_info mon_info = {0}; @@ -1707,8 +1730,8 @@ static void mbm_config_write_domain(struct rdt_resource *r, static int mon_config_write(struct rdt_resource *r, char *tok, u32 evtid) { char *dom_str = NULL, *id_str; + struct rdt_l3_mon_domain *d; unsigned long dom_id, val; - struct rdt_mon_domain *d; /* Walking r->domains, ensure it can't race with cpuhp */ lockdep_assert_cpus_held(); @@ -1838,6 +1861,18 @@ void resctrl_bmec_files_show(struct rdt_resource *r, struct kernfs_node *l3_mon_ kernfs_put(mon_kn); } +const char *rdtgroup_name_by_closid(u32 closid) +{ + struct rdtgroup *rdtgrp; + + list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) { + if (rdtgrp->closid == closid) + return rdt_kn_name(rdtgrp->kn); + } + + return NULL; +} + /* rdtgroup information files for one cache resource. */ static struct rftype res_common_files[] = { { @@ -1947,6 +1982,20 @@ static struct rftype res_common_files[] = { .kf_ops = &rdtgroup_kf_single_ops, .seq_show = rdt_thread_throttle_mode_show, }, + { + .name = "io_alloc", + .mode = 0644, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = resctrl_io_alloc_show, + .write = resctrl_io_alloc_write, + }, + { + .name = "io_alloc_cbm", + .mode = 0644, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = resctrl_io_alloc_cbm_show, + .write = resctrl_io_alloc_cbm_write, + }, { .name = "max_threshold_occupancy", .mode = 0644, @@ -2138,6 +2187,23 @@ static void thread_throttle_mode_init(void) RFTYPE_CTRL_INFO | RFTYPE_RES_MB); } +/* + * The resctrl file "io_alloc" is added using L3 resource. However, it results + * in this file being visible for *all* cache resources (eg. L2 cache), + * whether it supports "io_alloc" or not. + */ +static void io_alloc_init(void) +{ + struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3); + + if (r->cache.io_alloc_capable) { + resctrl_file_fflags_init("io_alloc", RFTYPE_CTRL_INFO | + RFTYPE_RES_CACHE); + resctrl_file_fflags_init("io_alloc_cbm", + RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE); + } +} + void resctrl_file_fflags_init(const char *config, unsigned long fflags) { struct rftype *rft; @@ -2330,6 +2396,8 @@ static unsigned long fflags_from_resource(struct rdt_resource *r) case RDT_RESOURCE_MBA: case RDT_RESOURCE_SMBA: return RFTYPE_RES_MB; + case RDT_RESOURCE_PERF_PKG: + return RFTYPE_RES_PERF_PKG; } return WARN_ON_ONCE(1); @@ -2716,10 +2784,12 @@ static int rdt_get_tree(struct fs_context *fc) { struct rdt_fs_context *ctx = rdt_fc2context(fc); unsigned long flags = RFTYPE_CTRL_BASE; - struct rdt_mon_domain *dom; + struct rdt_l3_mon_domain *dom; struct rdt_resource *r; int ret; + DO_ONCE_SLEEPABLE(resctrl_arch_pre_mount); + cpus_read_lock(); mutex_lock(&rdtgroup_mutex); /* @@ -2730,6 +2800,10 @@ static int rdt_get_tree(struct fs_context *fc) goto out; } + ret = setup_rmid_lru_list(); + if (ret) + goto out; + ret = rdtgroup_setup_root(ctx); if (ret) goto out; @@ -3026,7 +3100,8 @@ static void rmdir_all_sub(void) * @rid: The resource id for the event file being created. * @domid: The domain id for the event file being created. * @mevt: The type of event file being created. - * @do_sum: Whether SNC summing monitors are being created. + * @do_sum: Whether SNC summing monitors are being created. Only set + * when @rid == RDT_RESOURCE_L3. */ static struct mon_data *mon_get_kn_priv(enum resctrl_res_level rid, int domid, struct mon_evt *mevt, @@ -3038,7 +3113,7 @@ static struct mon_data *mon_get_kn_priv(enum resctrl_res_level rid, int domid, list_for_each_entry(priv, &mon_data_kn_priv_list, list) { if (priv->rid == rid && priv->domid == domid && - priv->sum == do_sum && priv->evtid == mevt->evtid) + priv->sum == do_sum && priv->evt == mevt) return priv; } @@ -3049,7 +3124,7 @@ static struct mon_data *mon_get_kn_priv(enum resctrl_res_level rid, int domid, priv->rid = rid; priv->domid = domid; priv->sum = do_sum; - priv->evtid = mevt->evtid; + priv->evt = mevt; list_add_tail(&priv->list, &mon_data_kn_priv_list); return priv; @@ -3158,23 +3233,24 @@ static void mon_rmdir_one_subdir(struct kernfs_node *pkn, char *name, char *subn } /* - * Remove all subdirectories of mon_data of ctrl_mon groups - * and monitor groups for the given domain. - * Remove files and directories containing "sum" of domain data - * when last domain being summed is removed. + * Remove files and directories for one SNC node. If it is the last node + * sharing an L3 cache, then remove the upper level directory containing + * the "sum" files too. */ -static void rmdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, - struct rdt_mon_domain *d) +static void rmdir_mondata_subdir_allrdtgrp_snc(struct rdt_resource *r, + struct rdt_domain_hdr *hdr) { struct rdtgroup *prgrp, *crgrp; + struct rdt_l3_mon_domain *d; char subname[32]; - bool snc_mode; char name[32]; - snc_mode = r->mon_scope == RESCTRL_L3_NODE; - sprintf(name, "mon_%s_%02d", r->name, snc_mode ? d->ci_id : d->hdr.id); - if (snc_mode) - sprintf(subname, "mon_sub_%s_%02d", r->name, d->hdr.id); + if (!domain_header_is_valid(hdr, RESCTRL_MON_DOMAIN, RDT_RESOURCE_L3)) + return; + + d = container_of(hdr, struct rdt_l3_mon_domain, hdr); + sprintf(name, "mon_%s_%02d", r->name, d->ci_id); + sprintf(subname, "mon_sub_%s_%02d", r->name, hdr->id); list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) { mon_rmdir_one_subdir(prgrp->mon.mon_data_kn, name, subname); @@ -3184,47 +3260,89 @@ static void rmdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, } } -static int mon_add_all_files(struct kernfs_node *kn, struct rdt_mon_domain *d, - struct rdt_resource *r, struct rdtgroup *prgrp, - bool do_sum) +/* + * Remove all subdirectories of mon_data of ctrl_mon groups + * and monitor groups for the given domain. + */ +static void rmdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, + struct rdt_domain_hdr *hdr) +{ + struct rdtgroup *prgrp, *crgrp; + char name[32]; + + if (r->rid == RDT_RESOURCE_L3 && r->mon_scope == RESCTRL_L3_NODE) { + rmdir_mondata_subdir_allrdtgrp_snc(r, hdr); + return; + } + + sprintf(name, "mon_%s_%02d", r->name, hdr->id); + list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) { + kernfs_remove_by_name(prgrp->mon.mon_data_kn, name); + + list_for_each_entry(crgrp, &prgrp->mon.crdtgrp_list, mon.crdtgrp_list) + kernfs_remove_by_name(crgrp->mon.mon_data_kn, name); + } +} + +/* + * Create a directory for a domain and populate it with monitor files. Create + * summing monitors when @hdr is NULL. No need to initialize summing monitors. + */ +static struct kernfs_node *_mkdir_mondata_subdir(struct kernfs_node *parent_kn, char *name, + struct rdt_domain_hdr *hdr, + struct rdt_resource *r, + struct rdtgroup *prgrp, int domid) { struct rmid_read rr = {0}; + struct kernfs_node *kn; struct mon_data *priv; struct mon_evt *mevt; - int ret, domid; + int ret; + + kn = kernfs_create_dir(parent_kn, name, parent_kn->mode, prgrp); + if (IS_ERR(kn)) + return kn; + + ret = rdtgroup_kn_set_ugid(kn); + if (ret) + goto out_destroy; for_each_mon_event(mevt) { if (mevt->rid != r->rid || !mevt->enabled) continue; - domid = do_sum ? d->ci_id : d->hdr.id; - priv = mon_get_kn_priv(r->rid, domid, mevt, do_sum); - if (WARN_ON_ONCE(!priv)) - return -EINVAL; + priv = mon_get_kn_priv(r->rid, domid, mevt, !hdr); + if (WARN_ON_ONCE(!priv)) { + ret = -EINVAL; + goto out_destroy; + } ret = mon_addfile(kn, mevt->name, priv); if (ret) - return ret; + goto out_destroy; - if (!do_sum && resctrl_is_mbm_event(mevt->evtid)) - mon_event_read(&rr, r, d, prgrp, &d->hdr.cpu_mask, mevt->evtid, true); + if (hdr && resctrl_is_mbm_event(mevt->evtid)) + mon_event_read(&rr, r, hdr, prgrp, &hdr->cpu_mask, mevt, true); } - return 0; + return kn; +out_destroy: + kernfs_remove(kn); + return ERR_PTR(ret); } -static int mkdir_mondata_subdir(struct kernfs_node *parent_kn, - struct rdt_mon_domain *d, - struct rdt_resource *r, struct rdtgroup *prgrp) +static int mkdir_mondata_subdir_snc(struct kernfs_node *parent_kn, + struct rdt_domain_hdr *hdr, + struct rdt_resource *r, struct rdtgroup *prgrp) { - struct kernfs_node *kn, *ckn; + struct kernfs_node *ckn, *kn; + struct rdt_l3_mon_domain *d; char name[32]; - bool snc_mode; - int ret = 0; - lockdep_assert_held(&rdtgroup_mutex); + if (!domain_header_is_valid(hdr, RESCTRL_MON_DOMAIN, RDT_RESOURCE_L3)) + return -EINVAL; - snc_mode = r->mon_scope == RESCTRL_L3_NODE; - sprintf(name, "mon_%s_%02d", r->name, snc_mode ? d->ci_id : d->hdr.id); + d = container_of(hdr, struct rdt_l3_mon_domain, hdr); + sprintf(name, "mon_%s_%02d", r->name, d->ci_id); kn = kernfs_find_and_get(parent_kn, name); if (kn) { /* @@ -3233,41 +3351,41 @@ static int mkdir_mondata_subdir(struct kernfs_node *parent_kn, */ kernfs_put(kn); } else { - kn = kernfs_create_dir(parent_kn, name, parent_kn->mode, prgrp); + kn = _mkdir_mondata_subdir(parent_kn, name, NULL, r, prgrp, d->ci_id); if (IS_ERR(kn)) return PTR_ERR(kn); + } - ret = rdtgroup_kn_set_ugid(kn); - if (ret) - goto out_destroy; - ret = mon_add_all_files(kn, d, r, prgrp, snc_mode); - if (ret) - goto out_destroy; + sprintf(name, "mon_sub_%s_%02d", r->name, hdr->id); + ckn = _mkdir_mondata_subdir(kn, name, hdr, r, prgrp, hdr->id); + if (IS_ERR(ckn)) { + kernfs_remove(kn); + return PTR_ERR(ckn); } - if (snc_mode) { - sprintf(name, "mon_sub_%s_%02d", r->name, d->hdr.id); - ckn = kernfs_create_dir(kn, name, parent_kn->mode, prgrp); - if (IS_ERR(ckn)) { - ret = -EINVAL; - goto out_destroy; - } + kernfs_activate(kn); + return 0; +} - ret = rdtgroup_kn_set_ugid(ckn); - if (ret) - goto out_destroy; +static int mkdir_mondata_subdir(struct kernfs_node *parent_kn, + struct rdt_domain_hdr *hdr, + struct rdt_resource *r, struct rdtgroup *prgrp) +{ + struct kernfs_node *kn; + char name[32]; - ret = mon_add_all_files(ckn, d, r, prgrp, false); - if (ret) - goto out_destroy; - } + lockdep_assert_held(&rdtgroup_mutex); + + if (r->rid == RDT_RESOURCE_L3 && r->mon_scope == RESCTRL_L3_NODE) + return mkdir_mondata_subdir_snc(parent_kn, hdr, r, prgrp); + + sprintf(name, "mon_%s_%02d", r->name, hdr->id); + kn = _mkdir_mondata_subdir(parent_kn, name, hdr, r, prgrp, hdr->id); + if (IS_ERR(kn)) + return PTR_ERR(kn); kernfs_activate(kn); return 0; - -out_destroy: - kernfs_remove(kn); - return ret; } /* @@ -3275,7 +3393,7 @@ static int mkdir_mondata_subdir(struct kernfs_node *parent_kn, * and "monitor" groups with given domain id. */ static void mkdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, - struct rdt_mon_domain *d) + struct rdt_domain_hdr *hdr) { struct kernfs_node *parent_kn; struct rdtgroup *prgrp, *crgrp; @@ -3283,12 +3401,12 @@ static void mkdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) { parent_kn = prgrp->mon.mon_data_kn; - mkdir_mondata_subdir(parent_kn, d, r, prgrp); + mkdir_mondata_subdir(parent_kn, hdr, r, prgrp); head = &prgrp->mon.crdtgrp_list; list_for_each_entry(crgrp, head, mon.crdtgrp_list) { parent_kn = crgrp->mon.mon_data_kn; - mkdir_mondata_subdir(parent_kn, d, r, crgrp); + mkdir_mondata_subdir(parent_kn, hdr, r, crgrp); } } } @@ -3297,14 +3415,14 @@ static int mkdir_mondata_subdir_alldom(struct kernfs_node *parent_kn, struct rdt_resource *r, struct rdtgroup *prgrp) { - struct rdt_mon_domain *dom; + struct rdt_domain_hdr *hdr; int ret; /* Walking r->domains, ensure it can't race with cpuhp */ lockdep_assert_cpus_held(); - list_for_each_entry(dom, &r->mon_domains, hdr.list) { - ret = mkdir_mondata_subdir(parent_kn, dom, r, prgrp); + list_for_each_entry(hdr, &r->mon_domains, list) { + ret = mkdir_mondata_subdir(parent_kn, hdr, r, prgrp); if (ret) return ret; } @@ -3480,7 +3598,7 @@ static int __init_one_rdt_domain(struct rdt_ctrl_domain *d, struct resctrl_schem * If there are no more shareable bits available on any domain then * the entire allocation will fail. */ -static int rdtgroup_init_cat(struct resctrl_schema *s, u32 closid) +int rdtgroup_init_cat(struct resctrl_schema *s, u32 closid) { struct rdt_ctrl_domain *d; int ret; @@ -4165,7 +4283,7 @@ static void rdtgroup_setup_default(void) mutex_unlock(&rdtgroup_mutex); } -static void domain_destroy_mon_state(struct rdt_mon_domain *d) +static void domain_destroy_l3_mon_state(struct rdt_l3_mon_domain *d) { int idx; @@ -4187,8 +4305,10 @@ void resctrl_offline_ctrl_domain(struct rdt_resource *r, struct rdt_ctrl_domain mutex_unlock(&rdtgroup_mutex); } -void resctrl_offline_mon_domain(struct rdt_resource *r, struct rdt_mon_domain *d) +void resctrl_offline_mon_domain(struct rdt_resource *r, struct rdt_domain_hdr *hdr) { + struct rdt_l3_mon_domain *d; + mutex_lock(&rdtgroup_mutex); /* @@ -4196,8 +4316,15 @@ void resctrl_offline_mon_domain(struct rdt_resource *r, struct rdt_mon_domain *d * per domain monitor data directories. */ if (resctrl_mounted && resctrl_arch_mon_capable()) - rmdir_mondata_subdir_allrdtgrp(r, d); + rmdir_mondata_subdir_allrdtgrp(r, hdr); + + if (r->rid != RDT_RESOURCE_L3) + goto out_unlock; + + if (!domain_header_is_valid(hdr, RESCTRL_MON_DOMAIN, RDT_RESOURCE_L3)) + goto out_unlock; + d = container_of(hdr, struct rdt_l3_mon_domain, hdr); if (resctrl_is_mbm_enabled()) cancel_delayed_work(&d->mbm_over); if (resctrl_is_mon_event_enabled(QOS_L3_OCCUP_EVENT_ID) && has_busy_rmid(d)) { @@ -4213,13 +4340,13 @@ void resctrl_offline_mon_domain(struct rdt_resource *r, struct rdt_mon_domain *d cancel_delayed_work(&d->cqm_limbo); } - domain_destroy_mon_state(d); - + domain_destroy_l3_mon_state(d); +out_unlock: mutex_unlock(&rdtgroup_mutex); } /** - * domain_setup_mon_state() - Initialise domain monitoring structures. + * domain_setup_l3_mon_state() - Initialise domain monitoring structures. * @r: The resource for the newly online domain. * @d: The newly online domain. * @@ -4227,11 +4354,17 @@ void resctrl_offline_mon_domain(struct rdt_resource *r, struct rdt_mon_domain *d * Called when the first CPU of a domain comes online, regardless of whether * the filesystem is mounted. * During boot this may be called before global allocations have been made by - * resctrl_mon_resource_init(). + * resctrl_l3_mon_resource_init(). + * + * Called during CPU online that may run as soon as CPU online callbacks + * are set up during resctrl initialization. The number of supported RMIDs + * may be reduced if additional mon_capable resources are enumerated + * at mount time. This means the rdt_l3_mon_domain::mbm_states[] and + * rdt_l3_mon_domain::rmid_busy_llc allocations may be larger than needed. * - * Returns 0 for success, or -ENOMEM. + * Return: 0 for success, or -ENOMEM. */ -static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_mon_domain *d) +static int domain_setup_l3_mon_state(struct rdt_resource *r, struct rdt_l3_mon_domain *d) { u32 idx_limit = resctrl_arch_system_num_rmid_idx(); size_t tsize = sizeof(*d->mbm_states[0]); @@ -4287,13 +4420,21 @@ int resctrl_online_ctrl_domain(struct rdt_resource *r, struct rdt_ctrl_domain *d return err; } -int resctrl_online_mon_domain(struct rdt_resource *r, struct rdt_mon_domain *d) +int resctrl_online_mon_domain(struct rdt_resource *r, struct rdt_domain_hdr *hdr) { - int err; + struct rdt_l3_mon_domain *d; + int err = -EINVAL; mutex_lock(&rdtgroup_mutex); - err = domain_setup_mon_state(r, d); + if (r->rid != RDT_RESOURCE_L3) + goto mkdir; + + if (!domain_header_is_valid(hdr, RESCTRL_MON_DOMAIN, RDT_RESOURCE_L3)) + goto out_unlock; + + d = container_of(hdr, struct rdt_l3_mon_domain, hdr); + err = domain_setup_l3_mon_state(r, d); if (err) goto out_unlock; @@ -4306,6 +4447,8 @@ int resctrl_online_mon_domain(struct rdt_resource *r, struct rdt_mon_domain *d) if (resctrl_is_mon_event_enabled(QOS_L3_OCCUP_EVENT_ID)) INIT_DELAYED_WORK(&d->cqm_limbo, cqm_handle_limbo); +mkdir: + err = 0; /* * If the filesystem is not mounted then only the default resource group * exists. Creation of its directories is deferred until mount time @@ -4313,7 +4456,7 @@ int resctrl_online_mon_domain(struct rdt_resource *r, struct rdt_mon_domain *d) * If resctrl is mounted, add per domain monitor data directories. */ if (resctrl_mounted && resctrl_arch_mon_capable()) - mkdir_mondata_subdir_allrdtgrp(r, d); + mkdir_mondata_subdir_allrdtgrp(r, hdr); out_unlock: mutex_unlock(&rdtgroup_mutex); @@ -4339,10 +4482,10 @@ static void clear_childcpus(struct rdtgroup *r, unsigned int cpu) } } -static struct rdt_mon_domain *get_mon_domain_from_cpu(int cpu, - struct rdt_resource *r) +static struct rdt_l3_mon_domain *get_mon_domain_from_cpu(int cpu, + struct rdt_resource *r) { - struct rdt_mon_domain *d; + struct rdt_l3_mon_domain *d; lockdep_assert_cpus_held(); @@ -4358,7 +4501,7 @@ static struct rdt_mon_domain *get_mon_domain_from_cpu(int cpu, void resctrl_offline_cpu(unsigned int cpu) { struct rdt_resource *l3 = resctrl_arch_get_resource(RDT_RESOURCE_L3); - struct rdt_mon_domain *d; + struct rdt_l3_mon_domain *d; struct rdtgroup *rdtgrp; mutex_lock(&rdtgroup_mutex); @@ -4408,13 +4551,15 @@ int resctrl_init(void) thread_throttle_mode_init(); - ret = resctrl_mon_resource_init(); + io_alloc_init(); + + ret = resctrl_l3_mon_resource_init(); if (ret) return ret; ret = sysfs_create_mount_point(fs_kobj, "resctrl"); if (ret) { - resctrl_mon_resource_exit(); + resctrl_l3_mon_resource_exit(); return ret; } @@ -4449,7 +4594,7 @@ int resctrl_init(void) cleanup_mountpoint: sysfs_remove_mount_point(fs_kobj, "resctrl"); - resctrl_mon_resource_exit(); + resctrl_l3_mon_resource_exit(); return ret; } @@ -4485,7 +4630,7 @@ static bool resctrl_online_domains_exist(void) * When called by the architecture code, all CPUs and resctrl domains must be * offline. This ensures the limbo and overflow handlers are not scheduled to * run, meaning the data structures they access can be freed by - * resctrl_mon_resource_exit(). + * resctrl_l3_mon_resource_exit(). * * After resctrl_exit() returns, the architecture code should return an * error from all resctrl_arch_ functions that can do this. @@ -4512,5 +4657,6 @@ void resctrl_exit(void) * it can be used to umount resctrl. */ - resctrl_mon_resource_exit(); + resctrl_l3_mon_resource_exit(); + free_rmid_lru_list(); } diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 5ff5d99f6ead5..ac8797f952365 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -8,6 +8,7 @@ #ifndef _LINUX_ACPI_H #define _LINUX_ACPI_H +#include #include #include /* for struct resource */ #include @@ -221,6 +222,17 @@ void acpi_reserve_initial_tables (void); void acpi_table_init_complete (void); int acpi_table_init (void); +static inline struct acpi_table_header *acpi_get_table_pointer(char *signature, u32 instance) +{ + struct acpi_table_header *table; + int status = acpi_get_table(signature, instance, &table); + + if (ACPI_FAILURE(status)) + return ERR_PTR(-ENOENT); + return table; +} +DEFINE_FREE(acpi_put_table, struct acpi_table_header *, if (!IS_ERR_OR_NULL(_T)) acpi_put_table(_T)) + int acpi_table_parse(char *id, acpi_tbl_table_handler handler); int __init_or_acpilib acpi_table_parse_entries(char *id, unsigned long table_size, int entry_id, @@ -1541,6 +1553,9 @@ int find_acpi_cpu_topology(unsigned int cpu, int level); int find_acpi_cpu_topology_cluster(unsigned int cpu); int find_acpi_cpu_topology_package(unsigned int cpu); int find_acpi_cpu_topology_hetero_id(unsigned int cpu); +void acpi_pptt_get_cpus_from_container(u32 acpi_cpu_id, cpumask_t *cpus); +int find_acpi_cache_level_from_id(u32 cache_id); +int acpi_pptt_get_cpumask_from_cache_id(u32 cache_id, cpumask_t *cpus); #else static inline int acpi_pptt_cpu_is_thread(unsigned int cpu) { @@ -1562,6 +1577,17 @@ static inline int find_acpi_cpu_topology_hetero_id(unsigned int cpu) { return -EINVAL; } +static inline void acpi_pptt_get_cpus_from_container(u32 acpi_cpu_id, + cpumask_t *cpus) { } +static inline int find_acpi_cache_level_from_id(u32 cache_id) +{ + return -ENOENT; +} +static inline int acpi_pptt_get_cpumask_from_cache_id(u32 cache_id, + cpumask_t *cpus) +{ + return -ENOENT; +} #endif void acpi_arch_init(void); diff --git a/include/linux/arm_mpam.h b/include/linux/arm_mpam.h new file mode 100644 index 0000000000000..f92a36187a527 --- /dev/null +++ b/include/linux/arm_mpam.h @@ -0,0 +1,98 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2025 Arm Ltd. */ + +#ifndef __LINUX_ARM_MPAM_H +#define __LINUX_ARM_MPAM_H + +#include +#include +#include + +struct mpam_msc; + +enum mpam_msc_iface { + MPAM_IFACE_MMIO, /* a real MPAM MSC */ + MPAM_IFACE_PCC, /* a fake MPAM MSC */ +}; + +enum mpam_class_types { + MPAM_CLASS_CACHE, /* Caches, e.g. L2, L3 */ + MPAM_CLASS_MEMORY, /* Main memory */ + MPAM_CLASS_UNKNOWN, /* Everything else, e.g. SMMU */ +}; + +#define MPAM_CLASS_ID_DEFAULT 255 + +#ifdef CONFIG_ACPI_MPAM +int acpi_mpam_parse_resources(struct mpam_msc *msc, + struct acpi_mpam_msc_node *tbl_msc); + +int acpi_mpam_count_msc(void); +#else +static inline int acpi_mpam_parse_resources(struct mpam_msc *msc, + struct acpi_mpam_msc_node *tbl_msc) +{ + return -EINVAL; +} + +static inline int acpi_mpam_count_msc(void) { return -EINVAL; } +#endif + +#ifdef CONFIG_ARM64_MPAM_DRIVER +int mpam_ris_create(struct mpam_msc *msc, u8 ris_idx, + enum mpam_class_types type, u8 class_id, int component_id); +#else +static inline int mpam_ris_create(struct mpam_msc *msc, u8 ris_idx, + enum mpam_class_types type, u8 class_id, + int component_id) +{ + return -EINVAL; +} +#endif + +bool resctrl_arch_alloc_capable(void); +bool resctrl_arch_mon_capable(void); + +void resctrl_arch_set_cpu_default_closid(int cpu, u32 closid); +void resctrl_arch_set_closid_rmid(struct task_struct *tsk, u32 closid, u32 rmid); +void resctrl_arch_set_cpu_default_closid_rmid(int cpu, u32 closid, u32 rmid); +void resctrl_arch_sched_in(struct task_struct *tsk); +bool resctrl_arch_match_closid(struct task_struct *tsk, u32 closid); +bool resctrl_arch_match_rmid(struct task_struct *tsk, u32 closid, u32 rmid); +u32 resctrl_arch_rmid_idx_encode(u32 closid, u32 rmid); +void resctrl_arch_rmid_idx_decode(u32 idx, u32 *closid, u32 *rmid); +u32 resctrl_arch_system_num_rmid_idx(void); + +struct rdt_resource; +void *resctrl_arch_mon_ctx_alloc(struct rdt_resource *r, enum resctrl_event_id evtid); +void resctrl_arch_mon_ctx_free(struct rdt_resource *r, enum resctrl_event_id evtid, void *ctx); + +/* + * The CPU configuration for MPAM is cheap to write, and is only written if it + * has changed. No need for fine grained enables. + */ +static inline void resctrl_arch_enable_mon(void) { } +static inline void resctrl_arch_disable_mon(void) { } +static inline void resctrl_arch_enable_alloc(void) { } +static inline void resctrl_arch_disable_alloc(void) { } + +static inline unsigned int resctrl_arch_round_mon_val(unsigned int val) +{ + return val; +} + +/** + * mpam_register_requestor() - Register a requestor with the MPAM driver + * @partid_max: The maximum PARTID value the requestor can generate. + * @pmg_max: The maximum PMG value the requestor can generate. + * + * Registers a requestor with the MPAM driver to ensure the chosen system-wide + * minimum PARTID and PMG values will allow the requestors features to be used. + * + * Returns an error if the registration is too late, and a larger PARTID/PMG + * value has been advertised to user-space. In this case the requestor should + * not use its MPAM features. Returns 0 on success. + */ +int mpam_register_requestor(u16 partid_max, u8 pmg_max); + +#endif /* __LINUX_ARM_MPAM_H */ diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index ff8f41ab7ce63..68be522449ec5 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -1005,6 +1005,7 @@ static __always_inline unsigned int cpumask_size(void) #define this_cpu_cpumask_var_ptr(x) this_cpu_read(x) #define __cpumask_var_read_mostly __read_mostly +#define CPUMASK_VAR_NULL NULL bool alloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node); @@ -1051,6 +1052,7 @@ static __always_inline bool cpumask_available(cpumask_var_t mask) #define this_cpu_cpumask_var_ptr(x) this_cpu_ptr(x) #define __cpumask_var_read_mostly +#define CPUMASK_VAR_NULL {} static __always_inline bool alloc_cpumask_var(cpumask_var_t *mask, gfp_t flags) { diff --git a/include/linux/dma-buf-mapping.h b/include/linux/dma-buf-mapping.h new file mode 100644 index 0000000000000..a3c0ce2d3a42f --- /dev/null +++ b/include/linux/dma-buf-mapping.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * DMA BUF Mapping Helpers + * + */ +#ifndef __DMA_BUF_MAPPING_H__ +#define __DMA_BUF_MAPPING_H__ +#include + +struct sg_table *dma_buf_phys_vec_to_sgt(struct dma_buf_attachment *attach, + struct p2pdma_provider *provider, + struct dma_buf_phys_vec *phys_vec, + size_t nr_ranges, size_t size, + enum dma_data_direction dir); +void dma_buf_free_sgt(struct dma_buf_attachment *attach, struct sg_table *sgt, + enum dma_data_direction dir); +#endif diff --git a/include/linux/dma-buf.h b/include/linux/dma-buf.h index d58e329ac0e71..0bc492090237e 100644 --- a/include/linux/dma-buf.h +++ b/include/linux/dma-buf.h @@ -22,6 +22,7 @@ #include #include #include +#include struct device; struct dma_buf; @@ -530,6 +531,16 @@ struct dma_buf_export_info { void *priv; }; +/** + * struct dma_buf_phys_vec - describe continuous chunk of memory + * @paddr: physical address of that chunk + * @len: Length of this chunk + */ +struct dma_buf_phys_vec { + phys_addr_t paddr; + size_t len; +}; + /** * DEFINE_DMA_BUF_EXPORT_INFO - helper macro for exporters * @name: export-info name diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 66e4abb2df0dc..723528e481506 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -271,6 +271,8 @@ enum iommu_cap { */ IOMMU_CAP_DEFERRED_FLUSH, IOMMU_CAP_DIRTY_TRACKING, /* IOMMU supports dirty tracking */ + /* ATS is supported and may be enabled for this device */ + IOMMU_CAP_PCI_ATS_SUPPORTED, }; /* These are the possible reserved region types */ diff --git a/include/linux/memory-failure.h b/include/linux/memory-failure.h new file mode 100644 index 0000000000000..d333dcdbeae70 --- /dev/null +++ b/include/linux/memory-failure.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_MEMORY_FAILURE_H +#define _LINUX_MEMORY_FAILURE_H + +#include + +struct pfn_address_space { + struct interval_tree_node node; + struct address_space *mapping; + int (*pfn_to_vma_pgoff)(struct vm_area_struct *vma, + unsigned long pfn, pgoff_t *pgoff); +}; + +#ifdef CONFIG_MEMORY_FAILURE +int register_pfn_address_space(struct pfn_address_space *pfn_space); +void unregister_pfn_address_space(struct pfn_address_space *pfn_space); +#else +static inline int register_pfn_address_space(struct pfn_address_space *pfn_space) +{ + return -EOPNOTSUPP; +} + +static inline void unregister_pfn_address_space(struct pfn_address_space *pfn_space) +{ +} +#endif /* CONFIG_MEMORY_FAILURE */ + +#endif /* _LINUX_MEMORY_FAILURE_H */ diff --git a/include/linux/mm.h b/include/linux/mm.h index 1e74eb7267acb..bf42a8bcd6bac 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -4171,6 +4171,7 @@ enum mf_action_page_type { MF_MSG_DAX, MF_MSG_UNSPLIT_THP, MF_MSG_ALREADY_POISONED, + MF_MSG_PFN_MAP, MF_MSG_UNKNOWN, }; diff --git a/include/linux/pci-p2pdma.h b/include/linux/pci-p2pdma.h index 951f81a38f3af..517e121d2598e 100644 --- a/include/linux/pci-p2pdma.h +++ b/include/linux/pci-p2pdma.h @@ -16,7 +16,58 @@ struct block_device; struct scatterlist; +/** + * struct p2pdma_provider + * + * A p2pdma provider is a range of MMIO address space available to the CPU. + */ +struct p2pdma_provider { + struct device *owner; + u64 bus_offset; +}; + +enum pci_p2pdma_map_type { + /* + * PCI_P2PDMA_MAP_UNKNOWN: Used internally as an initial state before + * the mapping type has been calculated. Exported routines for the API + * will never return this value. + */ + PCI_P2PDMA_MAP_UNKNOWN = 0, + + /* + * Not a PCI P2PDMA transfer. + */ + PCI_P2PDMA_MAP_NONE, + + /* + * PCI_P2PDMA_MAP_NOT_SUPPORTED: Indicates the transaction will + * traverse the host bridge and the host bridge is not in the + * allowlist. DMA Mapping routines should return an error when + * this is returned. + */ + PCI_P2PDMA_MAP_NOT_SUPPORTED, + + /* + * PCI_P2PDMA_MAP_BUS_ADDR: Indicates that two devices can talk to + * each other directly through a PCI switch and the transaction will + * not traverse the host bridge. Such a mapping should program + * the DMA engine with PCI bus addresses. + */ + PCI_P2PDMA_MAP_BUS_ADDR, + + /* + * PCI_P2PDMA_MAP_THRU_HOST_BRIDGE: Indicates two devices can talk + * to each other, but the transaction traverses a host bridge on the + * allowlist. In this case, a normal mapping either with CPU physical + * addresses (in the case of dma-direct) or IOVA addresses (in the + * case of IOMMUs) should be used to program the DMA engine. + */ + PCI_P2PDMA_MAP_THRU_HOST_BRIDGE, +}; + #ifdef CONFIG_PCI_P2PDMA +int pcim_p2pdma_init(struct pci_dev *pdev); +struct p2pdma_provider *pcim_p2pdma_provider(struct pci_dev *pdev, int bar); int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size, u64 offset); int pci_p2pdma_distance_many(struct pci_dev *provider, struct device **clients, @@ -33,7 +84,18 @@ int pci_p2pdma_enable_store(const char *page, struct pci_dev **p2p_dev, bool *use_p2pdma); ssize_t pci_p2pdma_enable_show(char *page, struct pci_dev *p2p_dev, bool use_p2pdma); +enum pci_p2pdma_map_type pci_p2pdma_map_type(struct p2pdma_provider *provider, + struct device *dev); #else /* CONFIG_PCI_P2PDMA */ +static inline int pcim_p2pdma_init(struct pci_dev *pdev) +{ + return -EOPNOTSUPP; +} +static inline struct p2pdma_provider *pcim_p2pdma_provider(struct pci_dev *pdev, + int bar) +{ + return NULL; +} static inline int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size, u64 offset) { @@ -85,6 +147,11 @@ static inline ssize_t pci_p2pdma_enable_show(char *page, { return sprintf(page, "none\n"); } +static inline enum pci_p2pdma_map_type +pci_p2pdma_map_type(struct p2pdma_provider *provider, struct device *dev) +{ + return PCI_P2PDMA_MAP_NOT_SUPPORTED; +} #endif /* CONFIG_PCI_P2PDMA */ @@ -99,51 +166,12 @@ static inline struct pci_dev *pci_p2pmem_find(struct device *client) return pci_p2pmem_find_many(&client, 1); } -enum pci_p2pdma_map_type { - /* - * PCI_P2PDMA_MAP_UNKNOWN: Used internally as an initial state before - * the mapping type has been calculated. Exported routines for the API - * will never return this value. - */ - PCI_P2PDMA_MAP_UNKNOWN = 0, - - /* - * Not a PCI P2PDMA transfer. - */ - PCI_P2PDMA_MAP_NONE, - - /* - * PCI_P2PDMA_MAP_NOT_SUPPORTED: Indicates the transaction will - * traverse the host bridge and the host bridge is not in the - * allowlist. DMA Mapping routines should return an error when - * this is returned. - */ - PCI_P2PDMA_MAP_NOT_SUPPORTED, - - /* - * PCI_P2PDMA_MAP_BUS_ADDR: Indicates that two devices can talk to - * each other directly through a PCI switch and the transaction will - * not traverse the host bridge. Such a mapping should program - * the DMA engine with PCI bus addresses. - */ - PCI_P2PDMA_MAP_BUS_ADDR, - - /* - * PCI_P2PDMA_MAP_THRU_HOST_BRIDGE: Indicates two devices can talk - * to each other, but the transaction traverses a host bridge on the - * allowlist. In this case, a normal mapping either with CPU physical - * addresses (in the case of dma-direct) or IOVA addresses (in the - * case of IOMMUs) should be used to program the DMA engine. - */ - PCI_P2PDMA_MAP_THRU_HOST_BRIDGE, -}; - struct pci_p2pdma_map_state { - struct dev_pagemap *pgmap; + struct p2pdma_provider *mem; enum pci_p2pdma_map_type map; - u64 bus_off; }; + /* helper for pci_p2pdma_state(), do not use directly */ void __pci_p2pdma_update_state(struct pci_p2pdma_map_state *state, struct device *dev, struct page *page); @@ -162,8 +190,7 @@ pci_p2pdma_state(struct pci_p2pdma_map_state *state, struct device *dev, struct page *page) { if (IS_ENABLED(CONFIG_PCI_P2PDMA) && is_pci_p2pdma_page(page)) { - if (state->pgmap != page_pgmap(page)) - __pci_p2pdma_update_state(state, dev, page); + __pci_p2pdma_update_state(state, dev, page); return state->map; } return PCI_P2PDMA_MAP_NONE; @@ -172,16 +199,15 @@ pci_p2pdma_state(struct pci_p2pdma_map_state *state, struct device *dev, /** * pci_p2pdma_bus_addr_map - Translate a physical address to a bus address * for a PCI_P2PDMA_MAP_BUS_ADDR transfer. - * @state: P2P state structure + * @provider: P2P provider structure * @paddr: physical address to map * * Map a physically contiguous PCI_P2PDMA_MAP_BUS_ADDR transfer. */ static inline dma_addr_t -pci_p2pdma_bus_addr_map(struct pci_p2pdma_map_state *state, phys_addr_t paddr) +pci_p2pdma_bus_addr_map(struct p2pdma_provider *provider, phys_addr_t paddr) { - WARN_ON_ONCE(state->map != PCI_P2PDMA_MAP_BUS_ADDR); - return paddr + state->bus_off; + return paddr + provider->bus_offset; } #endif /* _LINUX_PCI_P2P_H */ diff --git a/include/linux/platform_device.h b/include/linux/platform_device.h index bc843d76066a5..8299c0541dd2e 100644 --- a/include/linux/platform_device.h +++ b/include/linux/platform_device.h @@ -227,6 +227,7 @@ extern int platform_device_add_data(struct platform_device *pdev, extern int platform_device_add(struct platform_device *pdev); extern void platform_device_del(struct platform_device *pdev); extern void platform_device_put(struct platform_device *pdev); +DEFINE_FREE(platform_device_put, struct platform_device *, if (_T) platform_device_put(_T)) struct platform_driver { int (*probe)(struct platform_device *); diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index a7d92718b653f..006e57fd7ca58 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -53,6 +53,7 @@ enum resctrl_res_level { RDT_RESOURCE_L2, RDT_RESOURCE_MBA, RDT_RESOURCE_SMBA, + RDT_RESOURCE_PERF_PKG, /* Must be the last */ RDT_NUM_RESOURCES, @@ -131,15 +132,24 @@ enum resctrl_domain_type { * @list: all instances of this resource * @id: unique id for this instance * @type: type of this instance + * @rid: resource id for this instance * @cpu_mask: which CPUs share this resource */ struct rdt_domain_hdr { struct list_head list; int id; enum resctrl_domain_type type; + enum resctrl_res_level rid; struct cpumask cpu_mask; }; +static inline bool domain_header_is_valid(struct rdt_domain_hdr *hdr, + enum resctrl_domain_type type, + enum resctrl_res_level rid) +{ + return !WARN_ON_ONCE(hdr->type != type || hdr->rid != rid); +} + /** * struct rdt_ctrl_domain - group of CPUs sharing a resctrl control resource * @hdr: common header for different domain types @@ -169,7 +179,7 @@ struct mbm_cntr_cfg { }; /** - * struct rdt_mon_domain - group of CPUs sharing a resctrl monitor resource + * struct rdt_l3_mon_domain - group of CPUs sharing RDT_RESOURCE_L3 monitoring * @hdr: common header for different domain types * @ci_id: cache info id for this domain * @rmid_busy_llc: bitmap of which limbo RMIDs are above threshold @@ -183,7 +193,7 @@ struct mbm_cntr_cfg { * @cntr_cfg: array of assignable counters' configuration (indexed * by counter ID) */ -struct rdt_mon_domain { +struct rdt_l3_mon_domain { struct rdt_domain_hdr hdr; unsigned int ci_id; unsigned long *rmid_busy_llc; @@ -206,6 +216,8 @@ struct rdt_mon_domain { * @arch_has_sparse_bitmasks: True if a bitmask like f00f is valid. * @arch_has_per_cpu_cfg: True if QOS_CFG register for this cache * level has CPU scope. + * @io_alloc_capable: True if portion of the cache can be configured + * for I/O traffic. */ struct resctrl_cache { unsigned int cbm_len; @@ -213,6 +225,7 @@ struct resctrl_cache { unsigned int shareable_bits; bool arch_has_sparse_bitmasks; bool arch_has_per_cpu_cfg; + bool io_alloc_capable; }; /** @@ -258,6 +271,7 @@ enum resctrl_scope { RESCTRL_L2_CACHE = 2, RESCTRL_L3_CACHE = 3, RESCTRL_L3_NODE, + RESCTRL_PACKAGE, }; /** @@ -281,7 +295,7 @@ enum resctrl_schema_fmt { * events of monitor groups created via mkdir. */ struct resctrl_mon { - int num_rmid; + u32 num_rmid; unsigned int mbm_cfg_mask; int num_mbm_cntrs; bool mbm_cntr_assignable; @@ -355,10 +369,10 @@ struct resctrl_cpu_defaults { }; struct resctrl_mon_config_info { - struct rdt_resource *r; - struct rdt_mon_domain *d; - u32 evtid; - u32 mon_config; + struct rdt_resource *r; + struct rdt_l3_mon_domain *d; + u32 evtid; + u32 mon_config; }; /** @@ -400,7 +414,8 @@ u32 resctrl_arch_get_num_closid(struct rdt_resource *r); u32 resctrl_arch_system_num_rmid_idx(void); int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid); -void resctrl_enable_mon_event(enum resctrl_event_id eventid); +bool resctrl_enable_mon_event(enum resctrl_event_id eventid, bool any_cpu, + unsigned int binary_bits, void *arch_priv); bool resctrl_is_mon_event_enabled(enum resctrl_event_id eventid); @@ -495,22 +510,31 @@ int resctrl_arch_update_one(struct rdt_resource *r, struct rdt_ctrl_domain *d, u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_ctrl_domain *d, u32 closid, enum resctrl_conf_type type); int resctrl_online_ctrl_domain(struct rdt_resource *r, struct rdt_ctrl_domain *d); -int resctrl_online_mon_domain(struct rdt_resource *r, struct rdt_mon_domain *d); +int resctrl_online_mon_domain(struct rdt_resource *r, struct rdt_domain_hdr *hdr); void resctrl_offline_ctrl_domain(struct rdt_resource *r, struct rdt_ctrl_domain *d); -void resctrl_offline_mon_domain(struct rdt_resource *r, struct rdt_mon_domain *d); +void resctrl_offline_mon_domain(struct rdt_resource *r, struct rdt_domain_hdr *hdr); void resctrl_online_cpu(unsigned int cpu); void resctrl_offline_cpu(unsigned int cpu); +/* + * Architecture hook called at beginning of first file system mount attempt. + * No locks are held. + */ +void resctrl_arch_pre_mount(void); + /** * resctrl_arch_rmid_read() - Read the eventid counter corresponding to rmid * for this resource and domain. * @r: resource that the counter should be read from. - * @d: domain that the counter should be read from. + * @hdr: Header of domain that the counter should be read from. * @closid: closid that matches the rmid. Depending on the architecture, the * counter may match traffic of both @closid and @rmid, or @rmid * only. * @rmid: rmid of the counter to read. * @eventid: eventid to read, e.g. L3 occupancy. + * @arch_priv: Architecture private data for this event. + * The @arch_priv provided by the architecture via + * resctrl_enable_mon_event(). * @val: result of the counter read in bytes. * @arch_mon_ctx: An architecture specific value from * resctrl_arch_mon_ctx_alloc(), for MPAM this identifies @@ -526,9 +550,9 @@ void resctrl_offline_cpu(unsigned int cpu); * Return: * 0 on success, or -EIO, -EINVAL etc on error. */ -int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_mon_domain *d, +int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain_hdr *hdr, u32 closid, u32 rmid, enum resctrl_event_id eventid, - u64 *val, void *arch_mon_ctx); + void *arch_priv, u64 *val, void *arch_mon_ctx); /** * resctrl_arch_rmid_read_context_check() - warn about invalid contexts @@ -573,7 +597,7 @@ struct rdt_domain_hdr *resctrl_find_domain(struct list_head *h, int id, * * This can be called from any CPU. */ -void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_mon_domain *d, +void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_l3_mon_domain *d, u32 closid, u32 rmid, enum resctrl_event_id eventid); @@ -586,7 +610,7 @@ void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_mon_domain *d, * * This can be called from any CPU. */ -void resctrl_arch_reset_rmid_all(struct rdt_resource *r, struct rdt_mon_domain *d); +void resctrl_arch_reset_rmid_all(struct rdt_resource *r, struct rdt_l3_mon_domain *d); /** * resctrl_arch_reset_all_ctrls() - Reset the control for each CLOSID to its @@ -612,7 +636,7 @@ void resctrl_arch_reset_all_ctrls(struct rdt_resource *r); * * This can be called from any CPU. */ -void resctrl_arch_config_cntr(struct rdt_resource *r, struct rdt_mon_domain *d, +void resctrl_arch_config_cntr(struct rdt_resource *r, struct rdt_l3_mon_domain *d, enum resctrl_event_id evtid, u32 rmid, u32 closid, u32 cntr_id, bool assign); @@ -635,7 +659,7 @@ void resctrl_arch_config_cntr(struct rdt_resource *r, struct rdt_mon_domain *d, * Return: * 0 on success, or -EIO, -EINVAL etc on error. */ -int resctrl_arch_cntr_read(struct rdt_resource *r, struct rdt_mon_domain *d, +int resctrl_arch_cntr_read(struct rdt_resource *r, struct rdt_l3_mon_domain *d, u32 closid, u32 rmid, int cntr_id, enum resctrl_event_id eventid, u64 *val); @@ -650,10 +674,31 @@ int resctrl_arch_cntr_read(struct rdt_resource *r, struct rdt_mon_domain *d, * * This can be called from any CPU. */ -void resctrl_arch_reset_cntr(struct rdt_resource *r, struct rdt_mon_domain *d, +void resctrl_arch_reset_cntr(struct rdt_resource *r, struct rdt_l3_mon_domain *d, u32 closid, u32 rmid, int cntr_id, enum resctrl_event_id eventid); +/** + * resctrl_arch_io_alloc_enable() - Enable/disable io_alloc feature. + * @r: The resctrl resource. + * @enable: Enable (true) or disable (false) io_alloc on resource @r. + * + * This can be called from any CPU. + * + * Return: + * 0 on success, <0 on error. + */ +int resctrl_arch_io_alloc_enable(struct rdt_resource *r, bool enable); + +/** + * resctrl_arch_get_io_alloc_enabled() - Get io_alloc feature state. + * @r: The resctrl resource. + * + * Return: + * true if io_alloc is enabled or false if disabled. + */ +bool resctrl_arch_get_io_alloc_enabled(struct rdt_resource *r); + extern unsigned int resctrl_rmid_realloc_threshold; extern unsigned int resctrl_rmid_realloc_limit; diff --git a/include/linux/resctrl_types.h b/include/linux/resctrl_types.h index acfe07860b346..a5f56faa18d22 100644 --- a/include/linux/resctrl_types.h +++ b/include/linux/resctrl_types.h @@ -50,6 +50,17 @@ enum resctrl_event_id { QOS_L3_MBM_TOTAL_EVENT_ID = 0x02, QOS_L3_MBM_LOCAL_EVENT_ID = 0x03, + /* Intel Telemetry Events */ + PMT_EVENT_ENERGY, + PMT_EVENT_ACTIVITY, + PMT_EVENT_STALLS_LLC_HIT, + PMT_EVENT_C1_RES, + PMT_EVENT_UNHALTED_CORE_CYCLES, + PMT_EVENT_STALLS_LLC_MISS, + PMT_EVENT_AUTO_C6_RES, + PMT_EVENT_UNHALTED_REF_CYCLES, + PMT_EVENT_UOPS_RETIRED, + /* Must be the last */ QOS_NUM_EVENTS, }; diff --git a/include/linux/vfio.h b/include/linux/vfio.h index 18fc36aadc326..1c0d8c44d25fd 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -295,6 +295,8 @@ static inline void vfio_put_device(struct vfio_device *device) int vfio_register_group_dev(struct vfio_device *device); int vfio_register_emulated_iommu_dev(struct vfio_device *device); void vfio_unregister_group_dev(struct vfio_device *device); +bool vfio_device_try_get_registration(struct vfio_device *device); +void vfio_device_put_registration(struct vfio_device *device); int vfio_assign_device_set(struct vfio_device *device, void *set_id); unsigned int vfio_device_set_open_count(struct vfio_device_set *dev_set); diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h index 0ddc42732647e..5f79a9bb0ac5f 100644 --- a/include/linux/vfio_pci_core.h +++ b/include/linux/vfio_pci_core.h @@ -27,6 +27,9 @@ struct vfio_pci_core_device; struct vfio_pci_region; +struct p2pdma_provider; +struct dma_buf_phys_vec; +struct dma_buf_attachment; struct vfio_pci_eventfd { struct eventfd_ctx *ctx; @@ -55,9 +58,48 @@ struct vfio_pci_region { u32 flags; }; +struct vfio_pci_device_ops { + int (*get_dmabuf_phys)(struct vfio_pci_core_device *vdev, + struct p2pdma_provider **provider, + unsigned int region_index, + struct dma_buf_phys_vec *phys_vec, + struct vfio_region_dma_range *dma_ranges, + size_t nr_ranges); +}; + +#if IS_ENABLED(CONFIG_VFIO_PCI_DMABUF) +int vfio_pci_core_fill_phys_vec(struct dma_buf_phys_vec *phys_vec, + struct vfio_region_dma_range *dma_ranges, + size_t nr_ranges, phys_addr_t start, + phys_addr_t len); +int vfio_pci_core_get_dmabuf_phys(struct vfio_pci_core_device *vdev, + struct p2pdma_provider **provider, + unsigned int region_index, + struct dma_buf_phys_vec *phys_vec, + struct vfio_region_dma_range *dma_ranges, + size_t nr_ranges); +#else +static inline int +vfio_pci_core_fill_phys_vec(struct dma_buf_phys_vec *phys_vec, + struct vfio_region_dma_range *dma_ranges, + size_t nr_ranges, phys_addr_t start, + phys_addr_t len) +{ + return -EINVAL; +} +static inline int vfio_pci_core_get_dmabuf_phys( + struct vfio_pci_core_device *vdev, struct p2pdma_provider **provider, + unsigned int region_index, struct dma_buf_phys_vec *phys_vec, + struct vfio_region_dma_range *dma_ranges, size_t nr_ranges) +{ + return -EOPNOTSUPP; +} +#endif + struct vfio_pci_core_device { struct vfio_device vdev; struct pci_dev *pdev; + const struct vfio_pci_device_ops *pci_ops; void __iomem *barmap[PCI_STD_NUM_BARS]; bool bar_mmap_supported[PCI_STD_NUM_BARS]; u8 *pci_config_map; @@ -100,6 +142,7 @@ struct vfio_pci_core_device { struct vfio_pci_core_device *sriov_pf_core_dev; struct notifier_block nb; struct rw_semaphore memory_lock; + struct list_head dmabufs; }; enum vfio_pci_io_width { @@ -151,6 +194,7 @@ ssize_t vfio_pci_core_do_io_rw(struct vfio_pci_core_device *vdev, bool test_mem, loff_t off, size_t count, size_t x_start, size_t x_end, bool iswrite, enum vfio_pci_io_width max_width); +bool __vfio_pci_memory_enabled(struct vfio_pci_core_device *vdev); bool vfio_pci_core_range_intersect_range(loff_t buf_start, size_t buf_cnt, loff_t reg_start, size_t reg_cnt, loff_t *buf_offset, @@ -188,4 +232,7 @@ static inline bool is_aligned_for_order(struct vm_area_struct *vma, !IS_ALIGNED(pfn, 1 << order))); } +int vfio_pci_dma_buf_iommufd_map(struct dma_buf_attachment *attachment, + struct dma_buf_phys_vec *phys); + #endif /* VFIO_PCI_CORE_H */ diff --git a/include/ras/ras_event.h b/include/ras/ras_event.h index c9f0b1018bcce..892e17d1953d5 100644 --- a/include/ras/ras_event.h +++ b/include/ras/ras_event.h @@ -414,6 +414,7 @@ TRACE_EVENT(aer_event, EM ( MF_MSG_DAX, "dax page" ) \ EM ( MF_MSG_UNSPLIT_THP, "unsplit thp" ) \ EM ( MF_MSG_ALREADY_POISONED, "already poisoned" ) \ + EM ( MF_MSG_PFN_MAP, "non struct page pfn" ) \ EMe ( MF_MSG_UNKNOWN, "unknown page" ) /* diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h index e210fd2d16ffa..c71dc07751903 100644 --- a/include/uapi/linux/iommufd.h +++ b/include/uapi/linux/iommufd.h @@ -656,11 +656,15 @@ enum iommu_hw_info_type { * @IOMMU_HW_CAP_PCI_PASID_PRIV: Privileged Mode Supported, user ignores it * when the struct * iommu_hw_info::out_max_pasid_log2 is zero. + * @IOMMU_HW_CAP_PCI_ATS_NOT_SUPPORTED: ATS is not supported or cannot be used + * on this device (absence implies ATS + * may be enabled) */ enum iommufd_hw_capabilities { IOMMU_HW_CAP_DIRTY_TRACKING = 1 << 0, IOMMU_HW_CAP_PCI_PASID_EXEC = 1 << 1, IOMMU_HW_CAP_PCI_PASID_PRIV = 1 << 2, + IOMMU_HW_CAP_PCI_ATS_NOT_SUPPORTED = 1 << 3, }; /** diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index 75100bf009baf..ac2329f241417 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -14,6 +14,7 @@ #include #include +#include #define VFIO_API_VERSION 0 @@ -1478,6 +1479,33 @@ struct vfio_device_feature_bus_master { }; #define VFIO_DEVICE_FEATURE_BUS_MASTER 10 +/** + * Upon VFIO_DEVICE_FEATURE_GET create a dma_buf fd for the + * regions selected. + * + * open_flags are the typical flags passed to open(2), eg O_RDWR, O_CLOEXEC, + * etc. offset/length specify a slice of the region to create the dmabuf from. + * nr_ranges is the total number of (P2P DMA) ranges that comprise the dmabuf. + * + * flags should be 0. + * + * Return: The fd number on success, -1 and errno is set on failure. + */ +#define VFIO_DEVICE_FEATURE_DMA_BUF 11 + +struct vfio_region_dma_range { + __u64 offset; + __u64 length; +}; + +struct vfio_device_feature_dma_buf { + __u32 region_index; + __u32 open_flags; + __u32 flags; + __u32 nr_ranges; + struct vfio_region_dma_range dma_ranges[] __counted_by(nr_ranges); +}; + /* -------- API for Type1 VFIO IOMMU -------- */ /** diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index f973e7e73c90b..50c3fe2a1d550 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -479,8 +479,8 @@ int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents, } break; case PCI_P2PDMA_MAP_BUS_ADDR: - sg->dma_address = pci_p2pdma_bus_addr_map(&p2pdma_state, - sg_phys(sg)); + sg->dma_address = pci_p2pdma_bus_addr_map( + p2pdma_state.mem, sg_phys(sg)); sg_dma_len(sg) = sg->length; sg_dma_mark_bus_address(sg); continue; diff --git a/mm/Kconfig b/mm/Kconfig index 76001e9ba04f1..9e055c377384b 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -750,6 +750,7 @@ config MEMORY_FAILURE depends on ARCH_SUPPORTS_MEMORY_FAILURE bool "Enable recovery from hardware memory errors" select RAS + select INTERVAL_TREE help Enables code to recover from some memory failures on systems with MCA recovery. This allows a system to continue running diff --git a/mm/hmm.c b/mm/hmm.c index a56081d67ad69..25b13c71cbe65 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -811,7 +811,7 @@ dma_addr_t hmm_dma_map_pfn(struct device *dev, struct hmm_dma_map *map, break; case PCI_P2PDMA_MAP_BUS_ADDR: pfns[idx] |= HMM_PFN_P2PDMA_BUS | HMM_PFN_DMA_MAPPED; - return pci_p2pdma_bus_addr_map(p2pdma_state, paddr); + return pci_p2pdma_bus_addr_map(p2pdma_state->mem, paddr); default: return DMA_MAPPING_ERROR; } diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 6e770bad79ce3..067569332da4f 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -38,6 +38,7 @@ #include #include +#include #include #include #include @@ -154,6 +155,10 @@ static const struct ctl_table memory_failure_table[] = { } }; +static struct rb_root_cached pfn_space_itree = RB_ROOT_CACHED; + +static DEFINE_MUTEX(pfn_space_lock); + /* * Return values: * 1: the page is dissolved (if needed) and taken off from buddy, @@ -889,6 +894,7 @@ static const char * const action_page_types[] = { [MF_MSG_DAX] = "dax page", [MF_MSG_UNSPLIT_THP] = "unsplit thp", [MF_MSG_ALREADY_POISONED] = "already poisoned page", + [MF_MSG_PFN_MAP] = "non struct page pfn", [MF_MSG_UNKNOWN] = "unknown page", }; @@ -1281,7 +1287,7 @@ static int action_result(unsigned long pfn, enum mf_action_page_type type, { trace_memory_failure_event(pfn, type, result); - if (type != MF_MSG_ALREADY_POISONED) { + if (type != MF_MSG_ALREADY_POISONED && type != MF_MSG_PFN_MAP) { num_poisoned_pages_inc(pfn); update_per_node_mf_stats(pfn, result); } @@ -2165,6 +2171,142 @@ static void kill_procs_now(struct page *p, unsigned long pfn, int flags, kill_procs(&tokill, true, pfn, flags); } +int register_pfn_address_space(struct pfn_address_space *pfn_space) +{ + guard(mutex)(&pfn_space_lock); + + if (!pfn_space->pfn_to_vma_pgoff) + return -EINVAL; + + if (interval_tree_iter_first(&pfn_space_itree, + pfn_space->node.start, + pfn_space->node.last)) + return -EBUSY; + + interval_tree_insert(&pfn_space->node, &pfn_space_itree); + + return 0; +} +EXPORT_SYMBOL_GPL(register_pfn_address_space); + +void unregister_pfn_address_space(struct pfn_address_space *pfn_space) +{ + guard(mutex)(&pfn_space_lock); + + if (interval_tree_iter_first(&pfn_space_itree, + pfn_space->node.start, + pfn_space->node.last)) + interval_tree_remove(&pfn_space->node, &pfn_space_itree); +} +EXPORT_SYMBOL_GPL(unregister_pfn_address_space); + +static void add_to_kill_pgoff(struct task_struct *tsk, + struct vm_area_struct *vma, + struct list_head *to_kill, + pgoff_t pgoff) +{ + struct to_kill *tk; + + tk = kmalloc(sizeof(*tk), GFP_ATOMIC); + if (!tk) { + pr_info("Unable to kill proc %d\n", tsk->pid); + return; + } + + /* Check for pgoff not backed by struct page */ + tk->addr = vma_address(vma, pgoff, 1); + tk->size_shift = PAGE_SHIFT; + + if (tk->addr == -EFAULT) + pr_info("Unable to find address %lx in %s\n", + pgoff, tsk->comm); + + get_task_struct(tsk); + tk->tsk = tsk; + list_add_tail(&tk->nd, to_kill); +} + +/* + * Collect processes when the error hit a PFN not backed by struct page. + */ +static void collect_procs_pfn(struct pfn_address_space *pfn_space, + unsigned long pfn, struct list_head *to_kill) +{ + struct vm_area_struct *vma; + struct task_struct *tsk; + struct address_space *mapping = pfn_space->mapping; + + i_mmap_lock_read(mapping); + rcu_read_lock(); + for_each_process(tsk) { + struct task_struct *t = tsk; + + t = task_early_kill(tsk, true); + if (!t) + continue; + vma_interval_tree_foreach(vma, &mapping->i_mmap, 0, ULONG_MAX) { + pgoff_t pgoff; + + if (vma->vm_mm == t->mm && + !pfn_space->pfn_to_vma_pgoff(vma, pfn, &pgoff)) + add_to_kill_pgoff(t, vma, to_kill, pgoff); + } + } + rcu_read_unlock(); + i_mmap_unlock_read(mapping); +} + +/** + * memory_failure_pfn - Handle memory failure on a page not backed by + * struct page. + * @pfn: Page Number of the corrupted page + * @flags: fine tune action taken + * + * Return: + * 0 - success, + * -EBUSY - Page PFN does not belong to any address space mapping. + */ +static int memory_failure_pfn(unsigned long pfn, int flags) +{ + struct interval_tree_node *node; + LIST_HEAD(tokill); + + scoped_guard(mutex, &pfn_space_lock) { + bool mf_handled = false; + + /* + * Modules registers with MM the address space mapping to + * the device memory they manage. Iterate to identify + * exactly which address space has mapped to this failing + * PFN. + */ + for (node = interval_tree_iter_first(&pfn_space_itree, pfn, pfn); node; + node = interval_tree_iter_next(node, pfn, pfn)) { + struct pfn_address_space *pfn_space = + container_of(node, struct pfn_address_space, node); + + collect_procs_pfn(pfn_space, pfn, &tokill); + + mf_handled = true; + } + + if (!mf_handled) + return action_result(pfn, MF_MSG_PFN_MAP, MF_IGNORED); + } + + /* + * Unlike System-RAM there is no possibility to swap in a different + * physical page at a given virtual address, so all userspace + * consumption of direct PFN memory necessitates SIGBUS (i.e. + * MF_MUST_KILL) + */ + flags |= MF_ACTION_REQUIRED | MF_MUST_KILL; + + kill_procs(&tokill, true, pfn, flags); + + return action_result(pfn, MF_MSG_PFN_MAP, MF_RECOVERED); +} + /** * memory_failure - Handle memory failure of a page. * @pfn: Page Number of the corrupted page @@ -2214,6 +2356,14 @@ int memory_failure(unsigned long pfn, int flags) if (res == 0) goto unlock_mutex; + if (!pfn_valid(pfn) && !arch_is_platform_page(PFN_PHYS(pfn))) { + /* + * The PFN is not backed by struct page. + */ + res = memory_failure_pfn(pfn, flags); + goto unlock_mutex; + } + if (pfn_valid(pfn)) { pgmap = get_dev_pagemap(pfn); put_ref_page(pfn, flags); diff --git a/tools/testing/selftests/iommu/iommufd.c b/tools/testing/selftests/iommu/iommufd.c index 1f52140d9d9ee..b206b4cacd9bb 100644 --- a/tools/testing/selftests/iommu/iommufd.c +++ b/tools/testing/selftests/iommu/iommufd.c @@ -1572,6 +1572,49 @@ TEST_F(iommufd_ioas, copy_sweep) test_ioctl_destroy(dst_ioas_id); } +TEST_F(iommufd_ioas, dmabuf_simple) +{ + size_t buf_size = PAGE_SIZE*4; + __u64 iova; + int dfd; + + test_cmd_get_dmabuf(buf_size, &dfd); + test_err_ioctl_ioas_map_file(EINVAL, dfd, 0, 0, &iova); + test_err_ioctl_ioas_map_file(EINVAL, dfd, buf_size, buf_size, &iova); + test_err_ioctl_ioas_map_file(EINVAL, dfd, 0, buf_size + 1, &iova); + test_ioctl_ioas_map_file(dfd, 0, buf_size, &iova); + + close(dfd); +} + +TEST_F(iommufd_ioas, dmabuf_revoke) +{ + size_t buf_size = PAGE_SIZE*4; + __u32 hwpt_id; + __u64 iova; + __u64 iova2; + int dfd; + + test_cmd_get_dmabuf(buf_size, &dfd); + test_ioctl_ioas_map_file(dfd, 0, buf_size, &iova); + test_cmd_revoke_dmabuf(dfd, true); + + if (variant->mock_domains) + test_cmd_hwpt_alloc(self->device_id, self->ioas_id, 0, + &hwpt_id); + + test_err_ioctl_ioas_map_file(ENODEV, dfd, 0, buf_size, &iova2); + + test_cmd_revoke_dmabuf(dfd, false); + test_ioctl_ioas_map_file(dfd, 0, buf_size, &iova2); + + /* Restore the iova back */ + test_ioctl_ioas_unmap(iova, buf_size); + test_ioctl_ioas_map_fixed_file(dfd, 0, buf_size, iova); + + close(dfd); +} + FIXTURE(iommufd_mock_domain) { int fd; diff --git a/tools/testing/selftests/iommu/iommufd_utils.h b/tools/testing/selftests/iommu/iommufd_utils.h index 9f472c20c1905..55c285dbbf526 100644 --- a/tools/testing/selftests/iommu/iommufd_utils.h +++ b/tools/testing/selftests/iommu/iommufd_utils.h @@ -548,6 +548,39 @@ static int _test_cmd_destroy_access_pages(int fd, unsigned int access_id, EXPECT_ERRNO(_errno, _test_cmd_destroy_access_pages( \ self->fd, access_id, access_pages_id)) +static int _test_cmd_get_dmabuf(int fd, size_t len, int *out_fd) +{ + struct iommu_test_cmd cmd = { + .size = sizeof(cmd), + .op = IOMMU_TEST_OP_DMABUF_GET, + .dmabuf_get = { .length = len, .open_flags = O_CLOEXEC }, + }; + + *out_fd = ioctl(fd, IOMMU_TEST_CMD, &cmd); + if (*out_fd < 0) + return -1; + return 0; +} +#define test_cmd_get_dmabuf(len, out_fd) \ + ASSERT_EQ(0, _test_cmd_get_dmabuf(self->fd, len, out_fd)) + +static int _test_cmd_revoke_dmabuf(int fd, int dmabuf_fd, bool revoked) +{ + struct iommu_test_cmd cmd = { + .size = sizeof(cmd), + .op = IOMMU_TEST_OP_DMABUF_REVOKE, + .dmabuf_revoke = { .dmabuf_fd = dmabuf_fd, .revoked = revoked }, + }; + int ret; + + ret = ioctl(fd, IOMMU_TEST_CMD, &cmd); + if (ret < 0) + return -1; + return 0; +} +#define test_cmd_revoke_dmabuf(dmabuf_fd, revoke) \ + ASSERT_EQ(0, _test_cmd_revoke_dmabuf(self->fd, dmabuf_fd, revoke)) + static int _test_ioctl_destroy(int fd, unsigned int id) { struct iommu_destroy cmd = { @@ -718,6 +751,17 @@ static int _test_ioctl_ioas_map_file(int fd, unsigned int ioas_id, int mfd, self->fd, ioas_id, mfd, start, length, iova_p, \ IOMMU_IOAS_MAP_WRITEABLE | IOMMU_IOAS_MAP_READABLE)) +#define test_ioctl_ioas_map_fixed_file(mfd, start, length, iova) \ + ({ \ + __u64 __iova = iova; \ + ASSERT_EQ(0, _test_ioctl_ioas_map_file( \ + self->fd, self->ioas_id, mfd, start, \ + length, &__iova, \ + IOMMU_IOAS_MAP_FIXED_IOVA | \ + IOMMU_IOAS_MAP_WRITEABLE | \ + IOMMU_IOAS_MAP_READABLE)); \ + }) + static int _test_ioctl_set_temp_memory_limit(int fd, unsigned int limit) { struct iommu_test_cmd memlimit_cmd = {